path: root/lib/Target/X86
author    Dimitry Andric <dim@FreeBSD.org>  2019-08-20 20:50:12 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2019-08-20 20:50:12 +0000
commit    e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree      599ab169a01f1c86eda9adc774edaedde2f2db5b
parent    1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp | 1089
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmInstrumentation.h | 68
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 447
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParserCommon.h | 7
-rw-r--r--  lib/Target/X86/AsmParser/X86Operand.h | 58
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.cpp | 217
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp | 19
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.h | 14
-rw-r--r--  lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 202
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp | 142
-rw-r--r--  lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 162
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp | 487
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h (renamed from lib/Target/X86/InstPrinter/X86ATTInstPrinter.h) | 48
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 82
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 94
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 38
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86FixupKinds.h | 7
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86InstComments.cpp (renamed from lib/Target/X86/InstPrinter/X86InstComments.cpp) | 36
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86InstComments.h (renamed from lib/Target/X86/InstPrinter/X86InstComments.h) | 11
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp | 362
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h (renamed from lib/Target/X86/InstPrinter/X86InstPrinterCommon.h) | 19
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp | 445
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h (renamed from lib/Target/X86/InstPrinter/X86IntelInstPrinter.h) | 57
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 7
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h | 7
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 97
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCExpr.h | 9
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 22
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 10
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp | 7
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86TargetStreamer.h | 7
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 7
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 7
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp | 7
-rw-r--r--  lib/Target/X86/ShadowCallStack.cpp | 322
-rw-r--r--  lib/Target/X86/TargetInfo/X86TargetInfo.cpp | 9
-rw-r--r--  lib/Target/X86/TargetInfo/X86TargetInfo.h | 21
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.cpp | 14
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.h | 9
-rw-r--r--  lib/Target/X86/X86.h | 15
-rw-r--r--  lib/Target/X86/X86.td | 1226
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp | 274
-rw-r--r--  lib/Target/X86/X86AsmPrinter.h | 25
-rw-r--r--  lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp | 29
-rw-r--r--  lib/Target/X86/X86CallFrameOptimization.cpp | 12
-rw-r--r--  lib/Target/X86/X86CallLowering.cpp | 78
-rw-r--r--  lib/Target/X86/X86CallLowering.h | 13
-rw-r--r--  lib/Target/X86/X86CallingConv.cpp | 162
-rw-r--r--  lib/Target/X86/X86CallingConv.h | 104
-rw-r--r--  lib/Target/X86/X86CallingConv.td | 28
-rw-r--r--  lib/Target/X86/X86CmovConversion.cpp | 35
-rw-r--r--  lib/Target/X86/X86CondBrFolding.cpp | 26
-rw-r--r--  lib/Target/X86/X86DiscriminateMemOps.cpp | 42
-rw-r--r--  lib/Target/X86/X86DomainReassignment.cpp | 12
-rwxr-xr-x  lib/Target/X86/X86EvexToVex.cpp | 21
-rw-r--r--  lib/Target/X86/X86ExpandPseudo.cpp | 41
-rw-r--r--  lib/Target/X86/X86FastISel.cpp | 264
-rw-r--r--  lib/Target/X86/X86FixupBWInsts.cpp | 13
-rw-r--r--  lib/Target/X86/X86FixupLEAs.cpp | 393
-rw-r--r--  lib/Target/X86/X86FixupSetCC.cpp | 37
-rw-r--r--  lib/Target/X86/X86FlagsCopyLowering.cpp | 56
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 28
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 80
-rw-r--r--  lib/Target/X86/X86FrameLowering.h | 11
-rw-r--r--  lib/Target/X86/X86GenRegisterBankInfo.def | 7
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 1590
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 9548
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 216
-rw-r--r--  lib/Target/X86/X86IndirectBranchTracking.cpp | 49
-rw-r--r--  lib/Target/X86/X86InsertPrefetch.cpp | 10
-rw-r--r--  lib/Target/X86/X86Instr3DNow.td | 11
-rw-r--r--  lib/Target/X86/X86InstrAVX512.td | 3486
-rw-r--r--  lib/Target/X86/X86InstrArithmetic.td | 101
-rw-r--r--  lib/Target/X86/X86InstrBuilder.h | 7
-rw-r--r--  lib/Target/X86/X86InstrCMovSetCC.td | 176
-rw-r--r--  lib/Target/X86/X86InstrCompiler.td | 323
-rw-r--r--  lib/Target/X86/X86InstrControl.td | 64
-rw-r--r--  lib/Target/X86/X86InstrExtension.td | 11
-rw-r--r--  lib/Target/X86/X86InstrFMA.td | 13
-rw-r--r--  lib/Target/X86/X86InstrFMA3Info.cpp | 17
-rw-r--r--  lib/Target/X86/X86InstrFMA3Info.h | 7
-rw-r--r--  lib/Target/X86/X86InstrFPStack.td | 341
-rw-r--r--  lib/Target/X86/X86InstrFoldTables.cpp | 186
-rw-r--r--  lib/Target/X86/X86InstrFoldTables.h | 7
-rw-r--r--  lib/Target/X86/X86InstrFormats.td | 33
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 368
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 1116
-rw-r--r--  lib/Target/X86/X86InstrInfo.h | 79
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 439
-rw-r--r--  lib/Target/X86/X86InstrMMX.td | 13
-rw-r--r--  lib/Target/X86/X86InstrMPX.td | 7
-rw-r--r--  lib/Target/X86/X86InstrSGX.td | 7
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 1917
-rw-r--r--  lib/Target/X86/X86InstrSVM.td | 7
-rw-r--r--  lib/Target/X86/X86InstrShiftRotate.td | 98
-rw-r--r--  lib/Target/X86/X86InstrSystem.td | 26
-rw-r--r--  lib/Target/X86/X86InstrTSX.td | 7
-rw-r--r--  lib/Target/X86/X86InstrVMX.td | 7
-rw-r--r--  lib/Target/X86/X86InstrVecCompiler.td | 104
-rw-r--r--  lib/Target/X86/X86InstrXOP.td | 33
-rw-r--r--  lib/Target/X86/X86InstructionSelector.cpp | 92
-rw-r--r--  lib/Target/X86/X86InterleavedAccess.cpp | 27
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h | 781
-rw-r--r--  lib/Target/X86/X86LegalizerInfo.cpp | 30
-rw-r--r--  lib/Target/X86/X86LegalizerInfo.h | 7
-rw-r--r--  lib/Target/X86/X86MCInstLower.cpp | 274
-rw-r--r--  lib/Target/X86/X86MachineFunctionInfo.cpp | 7
-rw-r--r--  lib/Target/X86/X86MachineFunctionInfo.h | 7
-rw-r--r--  lib/Target/X86/X86MacroFusion.cpp | 164
-rw-r--r--  lib/Target/X86/X86MacroFusion.h | 7
-rw-r--r--  lib/Target/X86/X86OptimizeLEAs.cpp | 14
-rw-r--r--  lib/Target/X86/X86PadShortFunction.cpp | 16
-rw-r--r--  lib/Target/X86/X86PfmCounters.td | 7
-rw-r--r--  lib/Target/X86/X86RegisterBankInfo.cpp | 24
-rw-r--r--  lib/Target/X86/X86RegisterBankInfo.h | 7
-rw-r--r--  lib/Target/X86/X86RegisterBanks.td | 7
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp | 37
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h | 23
-rw-r--r--  lib/Target/X86/X86RegisterInfo.td | 44
-rw-r--r--  lib/Target/X86/X86RetpolineThunks.cpp | 7
-rwxr-xr-x  lib/Target/X86/X86SchedBroadwell.td | 169
-rw-r--r--  lib/Target/X86/X86SchedHaswell.td | 195
-rw-r--r--  lib/Target/X86/X86SchedPredicates.td | 31
-rw-r--r--  lib/Target/X86/X86SchedSandyBridge.td | 96
-rw-r--r--  lib/Target/X86/X86SchedSkylakeClient.td | 193
-rwxr-xr-x  lib/Target/X86/X86SchedSkylakeServer.td | 212
-rw-r--r--  lib/Target/X86/X86Schedule.td | 14
-rw-r--r--  lib/Target/X86/X86ScheduleAtom.td | 12
-rw-r--r--  lib/Target/X86/X86ScheduleBdVer2.td | 599
-rw-r--r--  lib/Target/X86/X86ScheduleBtVer2.td | 45
-rw-r--r--  lib/Target/X86/X86ScheduleSLM.td | 10
-rw-r--r--  lib/Target/X86/X86ScheduleZnver1.td | 10
-rw-r--r--  lib/Target/X86/X86SelectionDAGInfo.cpp | 222
-rw-r--r--  lib/Target/X86/X86SelectionDAGInfo.h | 7
-rw-r--r--  lib/Target/X86/X86ShuffleDecodeConstantPool.cpp | 7
-rw-r--r--  lib/Target/X86/X86ShuffleDecodeConstantPool.h | 7
-rw-r--r--  lib/Target/X86/X86SpeculativeLoadHardening.cpp | 41
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 22
-rw-r--r--  lib/Target/X86/X86Subtarget.h | 47
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 33
-rw-r--r--  lib/Target/X86/X86TargetMachine.h | 7
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp | 7
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h | 7
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp | 529
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.h | 76
-rw-r--r--  lib/Target/X86/X86VZeroUpper.cpp | 7
-rw-r--r--  lib/Target/X86/X86WinAllocaExpander.cpp | 46
-rw-r--r--  lib/Target/X86/X86WinEHState.cpp | 45
148 files changed, 18207 insertions, 14200 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
deleted file mode 100644
index 2c376fd062ca..000000000000
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ /dev/null
@@ -1,1089 +0,0 @@
-//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86AsmInstrumentation.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
-#include "X86Operand.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDwarf.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstBuilder.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/MC/MCParser/MCTargetAsmParser.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCTargetOptions.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/SMLoc.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <limits>
-#include <memory>
-#include <vector>
-
-// The following comment describes how assembly instrumentation works.
-// Currently we have only AddressSanitizer instrumentation, but we're
-// planning to implement MemorySanitizer for inline assembly too. If
-// you're not familiar with the AddressSanitizer algorithm, please read
-// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
-//
-// When inline assembly is parsed by an instance of X86AsmParser, all
-// instructions are emitted via the EmitInstruction method. That's the
-// place where X86AsmInstrumentation analyzes an instruction and
-// decides whether the instruction should be emitted as is or whether
-// instrumentation is required. The latter case happens when an
-// instruction reads from or writes to memory. Currently the instruction
-// opcode is checked explicitly, and if an instruction has a memory
-// operand (for instance, movq (%rsi, %rcx, 8), %rax) it should be
-// instrumented. There also exist instructions that modify memory but
-// don't have explicit memory operands, for instance movs.
-//
-// Let's first consider 8-byte memory accesses where an instruction has
-// an explicit memory operand. In this case we need two registers -
-// AddressReg to compute the address of the memory cells being accessed
-// and ShadowReg to compute the corresponding shadow address. So we need
-// to spill both registers before the instrumentation code and restore
-// them afterwards. Thus, in general, the instrumentation code will
-// look like this:
-// PUSHF # Store flags, otherwise they will be overwritten
-// PUSH AddressReg # spill AddressReg
-// PUSH ShadowReg # spill ShadowReg
-// LEA MemOp, AddressReg # compute address of the memory operand
-// MOV AddressReg, ShadowReg
-// SHR ShadowReg, 3
-// # ShadowOffset(AddressReg >> 3) contains address of a shadow
-// # corresponding to MemOp.
-// CMP ShadowOffset(ShadowReg), 0 # test shadow value
-// JZ .Done # when the shadow value is zero, everything is fine
-// MOV AddressReg, RDI
-// # Call __asan_report function with AddressReg as an argument
-// CALL __asan_report
-// .Done:
-// POP ShadowReg # Restore ShadowReg
-// POP AddressReg # Restore AddressReg
-// POPF # Restore flags
-//
-// Memory accesses of other sizes (1, 2, 4 and 16 bytes) are handled in
-// a similar manner, but small memory accesses (less than 8 bytes)
-// require an additional ScratchReg, which is used for the shadow value.
-//
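-// For illustration (using the shadow offsets defined below): an 8-byte
-// access at address A is treated as valid when the shadow byte at
-// (A >> 3) + ShadowOffset is zero. For a smaller access the shadow byte
-// may also hold a positive value k, in which case the access is valid
-// only if (A & 7) + AccessSize - 1 < k; this is the CMP/JL sequence
-// emitted by InstrumentMemOperandSmall.
-//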
-// If, for example, we're instrumenting an instruction like movs, only
-// the contents of RDI, RDI + AccessSize * RCX, RSI and RSI + AccessSize *
-// RCX are checked. In this case there is no need to spill and restore
-// AddressReg, ShadowReg or the flags four times; they're saved on the
-// stack just once, before instrumentation of these four addresses, and
-// restored at the end of the instrumentation.
-//
-// Several things complicate this simple algorithm:
-// * The instrumented memory operand can have RSP as a base or an index
-// register. So we need to add a constant offset before computing the
-// memory address, since flags, AddressReg, ShadowReg, etc. were
-// already stored on the stack and RSP was modified.
-// * Debug info (usually DWARF) should be adjusted, because RSP is
-// sometimes used as a frame register. So we need to select some
-// register as a frame register and temporarily override the current
-// CFA register.
-
-using namespace llvm;
-
-static cl::opt<bool> ClAsanInstrumentAssembly(
- "asan-instrument-assembly",
- cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
- cl::init(false));
-
-static const int64_t MinAllowedDisplacement =
- std::numeric_limits<int32_t>::min();
-static const int64_t MaxAllowedDisplacement =
- std::numeric_limits<int32_t>::max();
-
-static int64_t ApplyDisplacementBounds(int64_t Displacement) {
- return std::max(std::min(MaxAllowedDisplacement, Displacement),
- MinAllowedDisplacement);
-}
-
-static void CheckDisplacementBounds(int64_t Displacement) {
- assert(Displacement >= MinAllowedDisplacement &&
- Displacement <= MaxAllowedDisplacement);
-}
-
-static bool IsStackReg(unsigned Reg) {
- return Reg == X86::RSP || Reg == X86::ESP;
-}
-
-static bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
-
-namespace {
-
-class X86AddressSanitizer : public X86AsmInstrumentation {
-public:
- struct RegisterContext {
- private:
- enum RegOffset {
- REG_OFFSET_ADDRESS = 0,
- REG_OFFSET_SHADOW,
- REG_OFFSET_SCRATCH
- };
-
- public:
- RegisterContext(unsigned AddressReg, unsigned ShadowReg,
- unsigned ScratchReg) {
- BusyRegs.push_back(convReg(AddressReg, 64));
- BusyRegs.push_back(convReg(ShadowReg, 64));
- BusyRegs.push_back(convReg(ScratchReg, 64));
- }
-
- unsigned AddressReg(unsigned Size) const {
- return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size);
- }
-
- unsigned ShadowReg(unsigned Size) const {
- return convReg(BusyRegs[REG_OFFSET_SHADOW], Size);
- }
-
- unsigned ScratchReg(unsigned Size) const {
- return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size);
- }
-
- void AddBusyReg(unsigned Reg) {
- if (Reg != X86::NoRegister)
- BusyRegs.push_back(convReg(Reg, 64));
- }
-
- void AddBusyRegs(const X86Operand &Op) {
- AddBusyReg(Op.getMemBaseReg());
- AddBusyReg(Op.getMemIndexReg());
- }
-
- unsigned ChooseFrameReg(unsigned Size) const {
- static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX,
- X86::RCX, X86::RDX, X86::RDI,
- X86::RSI };
- for (unsigned Reg : Candidates) {
- if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg))
- return convReg(Reg, Size);
- }
- return X86::NoRegister;
- }
-
- private:
- unsigned convReg(unsigned Reg, unsigned Size) const {
- return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size);
- }
-
- std::vector<unsigned> BusyRegs;
- };
-
- X86AddressSanitizer(const MCSubtargetInfo *&STI)
- : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
-
- ~X86AddressSanitizer() override = default;
-
- // X86AsmInstrumentation implementation:
- void InstrumentAndEmitInstruction(const MCInst &Inst, OperandVector &Operands,
- MCContext &Ctx, const MCInstrInfo &MII,
- MCStreamer &Out,
- /* unused */ bool) override {
- InstrumentMOVS(Inst, Operands, Ctx, MII, Out);
- if (RepPrefix)
- EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX));
-
- InstrumentMOV(Inst, Operands, Ctx, MII, Out);
-
- RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX);
- if (!RepPrefix)
- EmitInstruction(Out, Inst);
- }
-
- // Adjusts the stack and saves all registers used in instrumentation.
- virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) = 0;
-
- // Restores all registers used in instrumentation and adjusts stack.
- virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) = 0;
-
- virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx, MCStreamer &Out) = 0;
- virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx, MCStreamer &Out) = 0;
-
- virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
- MCStreamer &Out) = 0;
-
- void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx,
- MCStreamer &Out);
- void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg,
- unsigned AccessSize, MCContext &Ctx, MCStreamer &Out);
-
- void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
- void InstrumentMOV(const MCInst &Inst, OperandVector &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
-
-protected:
- void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
-
- void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) {
- assert(Size == 32 || Size == 64);
- MCInst Inst;
- Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r);
- Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size)));
- Op.addMemOperands(Inst, 5);
- EmitInstruction(Out, Inst);
- }
-
- void ComputeMemOperandAddress(X86Operand &Op, unsigned Size,
- unsigned Reg, MCContext &Ctx, MCStreamer &Out);
-
- // Creates a new memory operand with Displacement added to the original
- // displacement. *Residue receives the remainder that is left over when
- // the total displacement exceeds the 32-bit limit.
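- // For example (illustration only): if Op already carries a constant
- // displacement of 0x7ffffff0 and Displacement is 0x100, the new operand
- // gets a displacement of INT32_MAX (0x7fffffff) and *Residue is 0xf1.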
- std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op,
- int64_t Displacement,
- MCContext &Ctx, int64_t *Residue);
-
- bool is64BitMode() const {
- return STI->getFeatureBits()[X86::Mode64Bit];
- }
-
- bool is32BitMode() const {
- return STI->getFeatureBits()[X86::Mode32Bit];
- }
-
- bool is16BitMode() const {
- return STI->getFeatureBits()[X86::Mode16Bit];
- }
-
- unsigned getPointerWidth() {
- if (is16BitMode()) return 16;
- if (is32BitMode()) return 32;
- if (is64BitMode()) return 64;
- llvm_unreachable("invalid mode");
- }
-
- // True when the previous instruction was actually a REP prefix.
- bool RepPrefix;
-
- // Offset from the original SP register.
- int64_t OrigSPOffset;
-};
-
-void X86AddressSanitizer::InstrumentMemOperand(
- X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- assert(Op.isMem() && "Op should be a memory operand.");
- assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 &&
- "AccessSize should be a power of two, less or equal than 16.");
- // FIXME: take into account load/store alignment.
- if (IsSmallMemAccess(AccessSize))
- InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
- else
- InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
-}
-
-void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
- unsigned CntReg,
- unsigned AccessSize,
- MCContext &Ctx, MCStreamer &Out) {
- // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)]
- // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)].
- RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */,
- IsSmallMemAccess(AccessSize)
- ? X86::RBX
- : X86::NoRegister /* ScratchReg */);
- RegCtx.AddBusyReg(DstReg);
- RegCtx.AddBusyReg(SrcReg);
- RegCtx.AddBusyReg(CntReg);
-
- InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
-
- // Test (%SrcReg)
- {
- const MCExpr *Disp = MCConstantExpr::create(0, Ctx);
- std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
- getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc()));
- InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
- Out);
- }
-
- // Test -1(%SrcReg, %CntReg, AccessSize)
- {
- const MCExpr *Disp = MCConstantExpr::create(-1, Ctx);
- std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
- getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(),
- SMLoc()));
- InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
- Out);
- }
-
- // Test (%DstReg)
- {
- const MCExpr *Disp = MCConstantExpr::create(0, Ctx);
- std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
- getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc()));
- InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
- }
-
- // Test -1(%DstReg, %CntReg, AccessSize)
- {
- const MCExpr *Disp = MCConstantExpr::create(-1, Ctx);
- std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
- getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(),
- SMLoc()));
- InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
- }
-
- InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
-}
-
-void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst,
- OperandVector &Operands,
- MCContext &Ctx, const MCInstrInfo &MII,
- MCStreamer &Out) {
- // Access size in bytes.
- unsigned AccessSize = 0;
-
- switch (Inst.getOpcode()) {
- case X86::MOVSB:
- AccessSize = 1;
- break;
- case X86::MOVSW:
- AccessSize = 2;
- break;
- case X86::MOVSL:
- AccessSize = 4;
- break;
- case X86::MOVSQ:
- AccessSize = 8;
- break;
- default:
- return;
- }
-
- InstrumentMOVSImpl(AccessSize, Ctx, Out);
-}
-
-void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst,
- OperandVector &Operands, MCContext &Ctx,
- const MCInstrInfo &MII,
- MCStreamer &Out) {
- // Access size in bytes.
- unsigned AccessSize = 0;
-
- switch (Inst.getOpcode()) {
- case X86::MOV8mi:
- case X86::MOV8mr:
- case X86::MOV8rm:
- AccessSize = 1;
- break;
- case X86::MOV16mi:
- case X86::MOV16mr:
- case X86::MOV16rm:
- AccessSize = 2;
- break;
- case X86::MOV32mi:
- case X86::MOV32mr:
- case X86::MOV32rm:
- AccessSize = 4;
- break;
- case X86::MOV64mi32:
- case X86::MOV64mr:
- case X86::MOV64rm:
- AccessSize = 8;
- break;
- case X86::MOVAPDmr:
- case X86::MOVAPSmr:
- case X86::MOVAPDrm:
- case X86::MOVAPSrm:
- AccessSize = 16;
- break;
- default:
- return;
- }
-
- const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
-
- for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
- assert(Operands[Ix]);
- MCParsedAsmOperand &Op = *Operands[Ix];
- if (Op.isMem()) {
- X86Operand &MemOp = static_cast<X86Operand &>(Op);
- RegisterContext RegCtx(
- X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */,
- IsSmallMemAccess(AccessSize) ? X86::RCX
- : X86::NoRegister /* ScratchReg */);
- RegCtx.AddBusyRegs(MemOp);
- InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
- InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out);
- InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
- }
- }
-}
-
-void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
- unsigned Size,
- unsigned Reg, MCContext &Ctx,
- MCStreamer &Out) {
- int64_t Displacement = 0;
- if (IsStackReg(Op.getMemBaseReg()))
- Displacement -= OrigSPOffset;
- if (IsStackReg(Op.getMemIndexReg()))
- Displacement -= OrigSPOffset * Op.getMemScale();
-
- assert(Displacement >= 0);
-
- // Emit Op as is.
- if (Displacement == 0) {
- EmitLEA(Op, Size, Reg, Out);
- return;
- }
-
- int64_t Residue;
- std::unique_ptr<X86Operand> NewOp =
- AddDisplacement(Op, Displacement, Ctx, &Residue);
- EmitLEA(*NewOp, Size, Reg, Out);
-
- while (Residue != 0) {
- const MCConstantExpr *Disp =
- MCConstantExpr::create(ApplyDisplacementBounds(Residue), Ctx);
- std::unique_ptr<X86Operand> DispOp =
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(),
- SMLoc());
- EmitLEA(*DispOp, Size, Reg, Out);
- Residue -= Disp->getValue();
- }
-}
-
-std::unique_ptr<X86Operand>
-X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement,
- MCContext &Ctx, int64_t *Residue) {
- assert(Displacement >= 0);
-
- if (Displacement == 0 ||
- (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) {
- *Residue = Displacement;
- return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(),
- Op.getMemDisp(), Op.getMemBaseReg(),
- Op.getMemIndexReg(), Op.getMemScale(),
- SMLoc(), SMLoc());
- }
-
- int64_t OrigDisplacement =
- static_cast<const MCConstantExpr *>(Op.getMemDisp())->getValue();
- CheckDisplacementBounds(OrigDisplacement);
- Displacement += OrigDisplacement;
-
- int64_t NewDisplacement = ApplyDisplacementBounds(Displacement);
- CheckDisplacementBounds(NewDisplacement);
-
- *Residue = Displacement - NewDisplacement;
- const MCExpr *Disp = MCConstantExpr::create(NewDisplacement, Ctx);
- return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp,
- Op.getMemBaseReg(), Op.getMemIndexReg(),
- Op.getMemScale(), SMLoc(), SMLoc());
-}
-
-class X86AddressSanitizer32 : public X86AddressSanitizer {
-public:
- static const long kShadowOffset = 0x20000000;
-
- X86AddressSanitizer32(const MCSubtargetInfo *&STI)
- : X86AddressSanitizer(STI) {}
-
- ~X86AddressSanitizer32() override = default;
-
- unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
- unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
- if (FrameReg == X86::NoRegister)
- return FrameReg;
- return getX86SubSuperRegister(FrameReg, 32);
- }
-
- void SpillReg(MCStreamer &Out, unsigned Reg) {
- EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg));
- OrigSPOffset -= 4;
- }
-
- void RestoreReg(MCStreamer &Out, unsigned Reg) {
- EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg));
- OrigSPOffset += 4;
- }
-
- void StoreFlags(MCStreamer &Out) {
- EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
- OrigSPOffset -= 4;
- }
-
- void RestoreFlags(MCStreamer &Out) {
- EmitInstruction(Out, MCInstBuilder(X86::POPF32));
- OrigSPOffset += 4;
- }
-
- void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
- assert(LocalFrameReg != X86::NoRegister);
-
- const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
- unsigned FrameReg = GetFrameReg(Ctx, Out);
- if (MRI && FrameReg != X86::NoRegister) {
- SpillReg(Out, LocalFrameReg);
- if (FrameReg == X86::ESP) {
- Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */);
- Out.EmitCFIRelOffset(
- MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
- }
- EmitInstruction(
- Out,
- MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg));
- Out.EmitCFIRememberState();
- Out.EmitCFIDefCfaRegister(
- MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
- }
-
- SpillReg(Out, RegCtx.AddressReg(32));
- SpillReg(Out, RegCtx.ShadowReg(32));
- if (RegCtx.ScratchReg(32) != X86::NoRegister)
- SpillReg(Out, RegCtx.ScratchReg(32));
- StoreFlags(Out);
- }
-
- void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
- assert(LocalFrameReg != X86::NoRegister);
-
- RestoreFlags(Out);
- if (RegCtx.ScratchReg(32) != X86::NoRegister)
- RestoreReg(Out, RegCtx.ScratchReg(32));
- RestoreReg(Out, RegCtx.ShadowReg(32));
- RestoreReg(Out, RegCtx.AddressReg(32));
-
- unsigned FrameReg = GetFrameReg(Ctx, Out);
- if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
- RestoreReg(Out, LocalFrameReg);
- Out.EmitCFIRestoreState();
- if (FrameReg == X86::ESP)
- Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */);
- }
- }
-
- void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
- MCStreamer &Out) override;
-
-private:
- void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
- MCStreamer &Out, const RegisterContext &RegCtx) {
- EmitInstruction(Out, MCInstBuilder(X86::CLD));
- EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
-
- EmitInstruction(Out, MCInstBuilder(X86::AND32ri8)
- .addReg(X86::ESP)
- .addReg(X86::ESP)
- .addImm(-16));
- EmitInstruction(
- Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32)));
-
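- // Build the runtime report entry point name, e.g. "__asan_report_load4"
- // or "__asan_report_store8".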
- MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") +
- (IsWrite ? "store" : "load") +
- Twine(AccessSize));
- const MCSymbolRefExpr *FnExpr =
- MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
- }
-};
-
-void X86AddressSanitizer32::InstrumentMemOperandSmall(
- X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI32 = RegCtx.AddressReg(32);
- unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
- unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
-
- assert(RegCtx.ScratchReg(32) != X86::NoRegister);
- unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
-
- ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
- AddressRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
- .addReg(ShadowRegI32)
- .addReg(ShadowRegI32)
- .addImm(3));
-
- {
- MCInst Inst;
- Inst.setOpcode(X86::MOV8rm);
- Inst.addOperand(MCOperand::createReg(ShadowRegI8));
- const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
- SMLoc(), SMLoc()));
- Op->addMemOperands(Inst, 5);
- EmitInstruction(Out, Inst);
- }
-
- EmitInstruction(
- Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
- AddressRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
- .addReg(ScratchRegI32)
- .addReg(ScratchRegI32)
- .addImm(7));
-
- switch (AccessSize) {
- default: llvm_unreachable("Incorrect access size");
- case 1:
- break;
- case 2: {
- const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
- SMLoc(), SMLoc()));
- EmitLEA(*Op, 32, ScratchRegI32, Out);
- break;
- }
- case 4:
- EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
- .addReg(ScratchRegI32)
- .addReg(ScratchRegI32)
- .addImm(3));
- break;
- }
-
- EmitInstruction(
- Out,
- MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
- EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
- ShadowRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
-
- EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
- EmitLabel(Out, DoneSym);
-}
-
-void X86AddressSanitizer32::InstrumentMemOperandLarge(
- X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI32 = RegCtx.AddressReg(32);
- unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
-
- ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
- AddressRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
- .addReg(ShadowRegI32)
- .addReg(ShadowRegI32)
- .addImm(3));
- {
- MCInst Inst;
- switch (AccessSize) {
- default: llvm_unreachable("Incorrect access size");
- case 8:
- Inst.setOpcode(X86::CMP8mi);
- break;
- case 16:
- Inst.setOpcode(X86::CMP16mi);
- break;
- }
- const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
- SMLoc(), SMLoc()));
- Op->addMemOperands(Inst, 5);
- Inst.addOperand(MCOperand::createImm(0));
- EmitInstruction(Out, Inst);
- }
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
- EmitLabel(Out, DoneSym);
-}
-
-void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize,
- MCContext &Ctx,
- MCStreamer &Out) {
- StoreFlags(Out);
-
- // No need to test when ECX equals zero.
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(
- Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX));
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- // Instrument first and last elements in src and dst range.
- InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */,
- X86::ECX /* CntReg */, AccessSize, Ctx, Out);
-
- EmitLabel(Out, DoneSym);
- RestoreFlags(Out);
-}
-
-class X86AddressSanitizer64 : public X86AddressSanitizer {
-public:
- static const long kShadowOffset = 0x7fff8000;
-
- X86AddressSanitizer64(const MCSubtargetInfo *&STI)
- : X86AddressSanitizer(STI) {}
-
- ~X86AddressSanitizer64() override = default;
-
- unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
- unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
- if (FrameReg == X86::NoRegister)
- return FrameReg;
- return getX86SubSuperRegister(FrameReg, 64);
- }
-
- void SpillReg(MCStreamer &Out, unsigned Reg) {
- EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg));
- OrigSPOffset -= 8;
- }
-
- void RestoreReg(MCStreamer &Out, unsigned Reg) {
- EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg));
- OrigSPOffset += 8;
- }
-
- void StoreFlags(MCStreamer &Out) {
- EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
- OrigSPOffset -= 8;
- }
-
- void RestoreFlags(MCStreamer &Out) {
- EmitInstruction(Out, MCInstBuilder(X86::POPF64));
- OrigSPOffset += 8;
- }
-
- void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
- assert(LocalFrameReg != X86::NoRegister);
-
- const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
- unsigned FrameReg = GetFrameReg(Ctx, Out);
- if (MRI && FrameReg != X86::NoRegister) {
- SpillReg(Out, X86::RBP);
- if (FrameReg == X86::RSP) {
- Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */);
- Out.EmitCFIRelOffset(
- MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
- }
- EmitInstruction(
- Out,
- MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg));
- Out.EmitCFIRememberState();
- Out.EmitCFIDefCfaRegister(
- MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
- }
-
- EmitAdjustRSP(Ctx, Out, -128);
- SpillReg(Out, RegCtx.ShadowReg(64));
- SpillReg(Out, RegCtx.AddressReg(64));
- if (RegCtx.ScratchReg(64) != X86::NoRegister)
- SpillReg(Out, RegCtx.ScratchReg(64));
- StoreFlags(Out);
- }
-
- void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
- assert(LocalFrameReg != X86::NoRegister);
-
- RestoreFlags(Out);
- if (RegCtx.ScratchReg(64) != X86::NoRegister)
- RestoreReg(Out, RegCtx.ScratchReg(64));
- RestoreReg(Out, RegCtx.AddressReg(64));
- RestoreReg(Out, RegCtx.ShadowReg(64));
- EmitAdjustRSP(Ctx, Out, 128);
-
- unsigned FrameReg = GetFrameReg(Ctx, Out);
- if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
- RestoreReg(Out, LocalFrameReg);
- Out.EmitCFIRestoreState();
- if (FrameReg == X86::RSP)
- Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */);
- }
- }
-
- void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
- MCStreamer &Out) override;
-
-private:
- void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
- const MCExpr *Disp = MCConstantExpr::create(Offset, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1,
- SMLoc(), SMLoc()));
- EmitLEA(*Op, 64, X86::RSP, Out);
- OrigSPOffset += Offset;
- }
-
- void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
- MCStreamer &Out, const RegisterContext &RegCtx) {
- EmitInstruction(Out, MCInstBuilder(X86::CLD));
- EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
-
- EmitInstruction(Out, MCInstBuilder(X86::AND64ri8)
- .addReg(X86::RSP)
- .addReg(X86::RSP)
- .addImm(-16));
-
- if (RegCtx.AddressReg(64) != X86::RDI) {
- EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg(
- RegCtx.AddressReg(64)));
- }
- MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") +
- (IsWrite ? "store" : "load") +
- Twine(AccessSize));
- const MCSymbolRefExpr *FnExpr =
- MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
- }
-};
-
-} // end anonymous namespace
-
-void X86AddressSanitizer64::InstrumentMemOperandSmall(
- X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI64 = RegCtx.AddressReg(64);
- unsigned AddressRegI32 = RegCtx.AddressReg(32);
- unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
- unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
- unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
-
- assert(RegCtx.ScratchReg(32) != X86::NoRegister);
- unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
-
- ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
- AddressRegI64));
- EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
- .addReg(ShadowRegI64)
- .addReg(ShadowRegI64)
- .addImm(3));
- {
- MCInst Inst;
- Inst.setOpcode(X86::MOV8rm);
- Inst.addOperand(MCOperand::createReg(ShadowRegI8));
- const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
- SMLoc(), SMLoc()));
- Op->addMemOperands(Inst, 5);
- EmitInstruction(Out, Inst);
- }
-
- EmitInstruction(
- Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
- AddressRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
- .addReg(ScratchRegI32)
- .addReg(ScratchRegI32)
- .addImm(7));
-
- switch (AccessSize) {
- default: llvm_unreachable("Incorrect access size");
- case 1:
- break;
- case 2: {
- const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
- SMLoc(), SMLoc()));
- EmitLEA(*Op, 32, ScratchRegI32, Out);
- break;
- }
- case 4:
- EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
- .addReg(ScratchRegI32)
- .addReg(ScratchRegI32)
- .addImm(3));
- break;
- }
-
- EmitInstruction(
- Out,
- MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
- EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
- ShadowRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
-
- EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
- EmitLabel(Out, DoneSym);
-}
-
-void X86AddressSanitizer64::InstrumentMemOperandLarge(
- X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI64 = RegCtx.AddressReg(64);
- unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
-
- ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
- AddressRegI64));
- EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
- .addReg(ShadowRegI64)
- .addReg(ShadowRegI64)
- .addImm(3));
- {
- MCInst Inst;
- switch (AccessSize) {
- default: llvm_unreachable("Incorrect access size");
- case 8:
- Inst.setOpcode(X86::CMP8mi);
- break;
- case 16:
- Inst.setOpcode(X86::CMP16mi);
- break;
- }
- const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
- SMLoc(), SMLoc()));
- Op->addMemOperands(Inst, 5);
- Inst.addOperand(MCOperand::createImm(0));
- EmitInstruction(Out, Inst);
- }
-
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
- EmitLabel(Out, DoneSym);
-}
-
-void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
- MCContext &Ctx,
- MCStreamer &Out) {
- StoreFlags(Out);
-
- // No need to test when RCX equals zero.
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(
- Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX));
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- // Instrument first and last elements in src and dst range.
- InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */,
- X86::RCX /* CntReg */, AccessSize, Ctx, Out);
-
- EmitLabel(Out, DoneSym);
- RestoreFlags(Out);
-}
-
-X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI)
- : STI(STI) {}
-
-X86AsmInstrumentation::~X86AsmInstrumentation() = default;
-
-void X86AsmInstrumentation::InstrumentAndEmitInstruction(
- const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
- const MCInstrInfo &MII, MCStreamer &Out, bool PrintSchedInfoEnabled) {
- EmitInstruction(Out, Inst, PrintSchedInfoEnabled);
-}
-
-void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, const MCInst &Inst,
- bool PrintSchedInfoEnabled) {
- Out.EmitInstruction(Inst, *STI, PrintSchedInfoEnabled);
-}
-
-unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
- MCStreamer &Out) {
- if (!Out.getNumFrameInfos()) // No active dwarf frame
- return X86::NoRegister;
- const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back();
- if (Frame.End) // Active dwarf frame is closed
- return X86::NoRegister;
- const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
- if (!MRI) // No register info
- return X86::NoRegister;
-
- if (InitialFrameReg) {
- // FrameReg is set explicitly, we're instrumenting a MachineFunction.
- return InitialFrameReg;
- }
-
- return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */);
-}
-
-X86AsmInstrumentation *
-llvm::CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx,
- const MCSubtargetInfo *&STI) {
- Triple T(STI->getTargetTriple());
- const bool hasCompilerRTSupport = T.isOSLinux();
- if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
- MCOptions.SanitizeAddress) {
- if (STI->getFeatureBits()[X86::Mode32Bit] != 0)
- return new X86AddressSanitizer32(STI);
- if (STI->getFeatureBits()[X86::Mode64Bit] != 0)
- return new X86AddressSanitizer64(STI);
- }
- return new X86AsmInstrumentation(STI);
-}
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
deleted file mode 100644
index 42a9dc3ba26a..000000000000
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//===- X86AsmInstrumentation.h - Instrument X86 inline assembly -*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
-#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
-
-#include "llvm/ADT/SmallVector.h"
-#include <memory>
-
-namespace llvm {
-
-class MCContext;
-class MCInst;
-class MCInstrInfo;
-class MCParsedAsmOperand;
-class MCStreamer;
-class MCSubtargetInfo;
-class MCTargetOptions;
-class X86AsmInstrumentation;
-
-X86AsmInstrumentation *
-CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx,
- const MCSubtargetInfo *&STI);
-
-class X86AsmInstrumentation {
-public:
- virtual ~X86AsmInstrumentation();
-
- // Sets the frame register corresponding to the current frame.
- void SetInitialFrameRegister(unsigned RegNo) {
- InitialFrameReg = RegNo;
- }
-
- // Tries to instrument and emit the instruction.
- virtual void InstrumentAndEmitInstruction(
- const MCInst &Inst,
- SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out,
- bool PrintSchedInfoEnabled);
-
-protected:
- friend X86AsmInstrumentation *
- CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx,
- const MCSubtargetInfo *&STI);
-
- X86AsmInstrumentation(const MCSubtargetInfo *&STI);
-
- unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out);
-
- void EmitInstruction(MCStreamer &Out, const MCInst &Inst,
- bool PrintSchedInfoEnabled = false);
-
- const MCSubtargetInfo *&STI;
-
- unsigned InitialFrameReg = 0;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 899b50d0f78f..95cbf46d37ed 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1,17 +1,16 @@
//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86MCExpr.h"
#include "MCTargetDesc/X86TargetStreamer.h"
-#include "X86AsmInstrumentation.h"
+#include "TargetInfo/X86TargetInfo.h"
#include "X86AsmParserCommon.h"
#include "X86Operand.h"
#include "llvm/ADT/STLExtras.h"
@@ -71,9 +70,17 @@ static const char OpPrecedence[] = {
class X86AsmParser : public MCTargetAsmParser {
ParseInstructionInfo *InstInfo;
- std::unique_ptr<X86AsmInstrumentation> Instrumentation;
bool Code16GCC;
+ enum VEXEncoding {
+ VEXEncoding_Default,
+ VEXEncoding_VEX2,
+ VEXEncoding_VEX3,
+ VEXEncoding_EVEX,
+ };
+
+ VEXEncoding ForcedVEXEncoding = VEXEncoding_Default;
+
private:
SMLoc consumeToken() {
MCAsmParser &Parser = getParser();
@@ -90,13 +97,14 @@ private:
}
unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst,
- uint64_t &ErrorInfo, bool matchingInlineAsm,
- unsigned VariantID = 0) {
+ uint64_t &ErrorInfo, FeatureBitset &MissingFeatures,
+ bool matchingInlineAsm, unsigned VariantID = 0) {
// In Code16GCC mode, match as 32-bit.
if (Code16GCC)
SwitchMode(X86::Mode32Bit);
unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo,
- matchingInlineAsm, VariantID);
+ MissingFeatures, matchingInlineAsm,
+ VariantID);
if (Code16GCC)
SwitchMode(X86::Mode16Bit);
return rv;
@@ -840,6 +848,8 @@ private:
const SMLoc &StartLoc,
SMLoc &EndLoc);
+ X86::CondCode ParseConditionCode(StringRef CCode);
+
bool ParseIntelMemoryOperandSize(unsigned &Size);
std::unique_ptr<X86Operand>
CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
@@ -860,6 +870,8 @@ private:
bool parseDirectiveFPOEndProc(SMLoc L);
bool parseDirectiveFPOData(SMLoc L);
+ unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+
bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
bool processInstruction(MCInst &Inst, const OperandVector &Ops);
@@ -875,7 +887,7 @@ private:
void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands,
MCStreamer &Out, bool MatchingInlineAsm);
- bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+ bool ErrorMissingFeature(SMLoc IDLoc, const FeatureBitset &MissingFeatures,
bool MatchingInlineAsm);
bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -914,7 +926,7 @@ private:
MCSubtargetInfo &STI = copySTI();
FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit});
FeatureBitset OldMode = STI.getFeatureBits() & AllModes;
- uint64_t FB = ComputeAvailableFeatures(
+ FeatureBitset FB = ComputeAvailableFeatures(
STI.ToggleFeature(OldMode.flip(mode)));
setAvailableFeatures(FB);
@@ -941,6 +953,9 @@ private:
/// }
public:
+ enum X86MatchResultTy {
+ Match_Unsupported = FIRST_TARGET_MATCH_RESULT_TY,
+ };
X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
const MCInstrInfo &mii, const MCTargetOptions &Options)
@@ -951,14 +966,10 @@ public:
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
- Instrumentation.reset(
- CreateX86AsmInstrumentation(Options, Parser.getContext(), STI));
}
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
- void SetFrameRegister(unsigned RegNo) override;
-
bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
@@ -1115,8 +1126,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
}
// Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
- if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
- RegNo = X86::ST0;
+ if (RegNo == X86::ST0) {
Parser.Lex(); // Eat 'st'
// Check to see if we have '(4)' after %st.
@@ -1194,10 +1204,6 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
return false;
}
-void X86AsmParser::SetFrameRegister(unsigned RegNo) {
- Instrumentation->SetInitialFrameRegister(RegNo);
-}
-
std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
bool Parse32 = is32BitMode() || Code16GCC;
unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI);
@@ -1656,6 +1662,8 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start) {
const AsmToken &Tok = Parser.getTok();
// Eat "{" and mark the current place.
const SMLoc consumedToken = consumeToken();
+ if (Tok.isNot(AsmToken::Identifier))
+ return ErrorOperand(Tok.getLoc(), "Expected an identifier after {");
if (Tok.getIdentifier().startswith("r")){
int rndMode = StringSwitch<int>(Tok.getIdentifier())
.Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
@@ -1999,6 +2007,29 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
}
}
+// Returns the EFLAGS condition code enumerator for a recognized condition
+// code or alternate mnemonic, and X86::COND_INVALID otherwise.
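+// For example, "b" and its alternate mnemonic "nae" both map to X86::COND_B.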
+X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) {
+ return StringSwitch<X86::CondCode>(CC)
+ .Case("o", X86::COND_O) // Overflow
+ .Case("no", X86::COND_NO) // No Overflow
+ .Cases("b", "nae", X86::COND_B) // Below/Neither Above nor Equal
+ .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below
+ .Cases("e", "z", X86::COND_E) // Equal/Zero
+ .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero
+ .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above
+ .Cases("a", "nbe", X86::COND_A) // Above/Neither Below nor Equal
+ .Case("s", X86::COND_S) // Sign
+ .Case("ns", X86::COND_NS) // No Sign
+ .Cases("p", "pe", X86::COND_P) // Parity/Parity Even
+ .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd
+ .Cases("l", "nge", X86::COND_L) // Less/Neither Greater nor Equal
+ .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less
+ .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater
+ .Cases("g", "nle", X86::COND_G) // Greater/Neither Less nor Equal
+ .Default(X86::COND_INVALID);
+}
+
// true on failure, false otherwise
// If no {z} mark was found - Parser doesn't advance
bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
@@ -2305,18 +2336,64 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) {
MCAsmParser &Parser = getParser();
InstInfo = &Info;
+
+ // Reset the forced VEX encoding.
+ ForcedVEXEncoding = VEXEncoding_Default;
+
+ // Parse pseudo prefixes.
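+ // For example, a "{vex3}" prefix before the mnemonic (as in
+ // "{vex3} vpaddd %xmm1, %xmm2, %xmm3") requests the three-byte VEX form,
+ // and "{evex}" requests an EVEX encoding.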
+ while (1) {
+ if (Name == "{") {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(), "Unexpected token after '{'");
+ std::string Prefix = Parser.getTok().getString().lower();
+ Parser.Lex(); // Eat identifier.
+ if (getLexer().isNot(AsmToken::RCurly))
+ return Error(Parser.getTok().getLoc(), "Expected '}'");
+ Parser.Lex(); // Eat curly.
+
+ if (Prefix == "vex2")
+ ForcedVEXEncoding = VEXEncoding_VEX2;
+ else if (Prefix == "vex3")
+ ForcedVEXEncoding = VEXEncoding_VEX3;
+ else if (Prefix == "evex")
+ ForcedVEXEncoding = VEXEncoding_EVEX;
+ else
+ return Error(NameLoc, "unknown prefix");
+
+ NameLoc = Parser.getTok().getLoc();
+ if (getLexer().is(AsmToken::LCurly)) {
+ Parser.Lex();
+ Name = "{";
+ } else {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(), "Expected identifier");
+ // FIXME: The mnemonic won't match correctly if it's not in lower case.
+ Name = Parser.getTok().getString();
+ Parser.Lex();
+ }
+ continue;
+ }
+
+ break;
+ }
+
StringRef PatchedName = Name;
- if ((Name.equals("jmp") || Name.equals("jc") || Name.equals("jz")) &&
- isParsingIntelSyntax() && isParsingInlineAsm()) {
+ // Hack to skip "short" following Jcc.
+ if (isParsingIntelSyntax() &&
+ (PatchedName == "jmp" || PatchedName == "jc" || PatchedName == "jnc" ||
+ PatchedName == "jcxz" || PatchedName == "jexcz" ||
+ (PatchedName.startswith("j") &&
+ ParseConditionCode(PatchedName.substr(1)) != X86::COND_INVALID))) {
StringRef NextTok = Parser.getTok().getString();
if (NextTok == "short") {
SMLoc NameEndLoc =
NameLoc.getFromPointer(NameLoc.getPointer() + Name.size());
- // Eat the short keyword
+ // Eat the short keyword.
Parser.Lex();
- // MS ignores the short keyword, it determines the jmp type based
- // on the distance of the label
+ // MS and GAS ignore the short keyword; they both determine the jmp type
+ // based on the distance of the label. (NASM does emit different code with
+ // and without "short," though.)
InstInfo->AsmRewrites->emplace_back(AOK_Skip, NameEndLoc,
NextTok.size() + 1);
}
@@ -2327,13 +2404,15 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
PatchedName != "setb" && PatchedName != "setnb")
PatchedName = PatchedName.substr(0, Name.size()-1);
+ unsigned ComparisonPredicate = ~0U;
+
// FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
(PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
bool IsVCMP = PatchedName[0] == 'v';
unsigned CCIdx = IsVCMP ? 4 : 3;
- unsigned ComparisonCode = StringSwitch<unsigned>(
+ unsigned CC = StringSwitch<unsigned>(
PatchedName.slice(CCIdx, PatchedName.size() - 2))
.Case("eq", 0x00)
.Case("eq_oq", 0x00)
@@ -2383,26 +2462,29 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
.Case("gt_oq", 0x1E)
.Case("true_us", 0x1F)
.Default(~0U);
- if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) {
-
- Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx),
- NameLoc));
-
- const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
- getParser().getContext());
- Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+ if (CC != ~0U && (IsVCMP || CC < 8)) {
+ if (PatchedName.endswith("ss"))
+ PatchedName = IsVCMP ? "vcmpss" : "cmpss";
+ else if (PatchedName.endswith("sd"))
+ PatchedName = IsVCMP ? "vcmpsd" : "cmpsd";
+ else if (PatchedName.endswith("ps"))
+ PatchedName = IsVCMP ? "vcmpps" : "cmpps";
+ else if (PatchedName.endswith("pd"))
+ PatchedName = IsVCMP ? "vcmppd" : "cmppd";
+ else
+ llvm_unreachable("Unexpected suffix!");
- PatchedName = PatchedName.substr(PatchedName.size() - 2);
+ ComparisonPredicate = CC;
}
}
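(Sketch of the effect, not part of the patch: in AT&T syntax the predicate that
used to be folded into the mnemonic is re-emitted as a leading immediate, e.g.
    cmpltss %xmm1, %xmm0             ->  cmpss $1, %xmm1, %xmm0
    vcmpge_oqpd %ymm2, %ymm1, %ymm0  ->  vcmppd $0x1d, %ymm2, %ymm1, %ymm0
The immediate itself is pushed further down, right after the mnemonic token.)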
// FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}.
if (PatchedName.startswith("vpcmp") &&
- (PatchedName.endswith("b") || PatchedName.endswith("w") ||
- PatchedName.endswith("d") || PatchedName.endswith("q"))) {
- unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
- unsigned ComparisonCode = StringSwitch<unsigned>(
- PatchedName.slice(5, PatchedName.size() - CCIdx))
+ (PatchedName.back() == 'b' || PatchedName.back() == 'w' ||
+ PatchedName.back() == 'd' || PatchedName.back() == 'q')) {
+ unsigned SuffixSize = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+ unsigned CC = StringSwitch<unsigned>(
+ PatchedName.slice(5, PatchedName.size() - SuffixSize))
.Case("eq", 0x0) // Only allowed on unsigned. Checked below.
.Case("lt", 0x1)
.Case("le", 0x2)
@@ -2412,24 +2494,26 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
.Case("nle", 0x6)
//.Case("true", 0x7) // Not a documented alias.
.Default(~0U);
- if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) {
- Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc));
-
- const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
- getParser().getContext());
- Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
-
- PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+ if (CC != ~0U && (CC != 0 || SuffixSize == 2)) {
+ switch (PatchedName.back()) {
+ default: llvm_unreachable("Unexpected character!");
+ case 'b': PatchedName = SuffixSize == 2 ? "vpcmpub" : "vpcmpb"; break;
+ case 'w': PatchedName = SuffixSize == 2 ? "vpcmpuw" : "vpcmpw"; break;
+ case 'd': PatchedName = SuffixSize == 2 ? "vpcmpud" : "vpcmpd"; break;
+ case 'q': PatchedName = SuffixSize == 2 ? "vpcmpuq" : "vpcmpq"; break;
+ }
+ // Set up the immediate to push into the operands later.
+ ComparisonPredicate = CC;
}
}
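(Illustrative only: the same scheme maps, say, "vpcmpleud" onto "vpcmpud" with
predicate 0x2, so
    vpcmpleud %zmm2, %zmm1, %k1   behaves like   vpcmpud $2, %zmm2, %zmm1, %k1
assuming the appropriate AVX-512 feature for the element width.)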
// FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}.
if (PatchedName.startswith("vpcom") &&
- (PatchedName.endswith("b") || PatchedName.endswith("w") ||
- PatchedName.endswith("d") || PatchedName.endswith("q"))) {
- unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
- unsigned ComparisonCode = StringSwitch<unsigned>(
- PatchedName.slice(5, PatchedName.size() - CCIdx))
+ (PatchedName.back() == 'b' || PatchedName.back() == 'w' ||
+ PatchedName.back() == 'd' || PatchedName.back() == 'q')) {
+ unsigned SuffixSize = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+ unsigned CC = StringSwitch<unsigned>(
+ PatchedName.slice(5, PatchedName.size() - SuffixSize))
.Case("lt", 0x0)
.Case("le", 0x1)
.Case("gt", 0x2)
@@ -2439,14 +2523,16 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
.Case("false", 0x6)
.Case("true", 0x7)
.Default(~0U);
- if (ComparisonCode != ~0U) {
- Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc));
-
- const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
- getParser().getContext());
- Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
-
- PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+ if (CC != ~0U) {
+ switch (PatchedName.back()) {
+ default: llvm_unreachable("Unexpected character!");
+ case 'b': PatchedName = SuffixSize == 2 ? "vpcomub" : "vpcomb"; break;
+ case 'w': PatchedName = SuffixSize == 2 ? "vpcomuw" : "vpcomw"; break;
+ case 'd': PatchedName = SuffixSize == 2 ? "vpcomud" : "vpcomd"; break;
+ case 'q': PatchedName = SuffixSize == 2 ? "vpcomuq" : "vpcomq"; break;
+ }
+ // Set up the immediate to push into the operands later.
+ ComparisonPredicate = CC;
}
}
@@ -2489,6 +2575,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Flags = X86::IP_NO_PREFIX;
break;
}
+    // FIXME: The mnemonic won't match correctly if it's not in lower case.
Name = Parser.getTok().getString();
Parser.Lex(); // eat the prefix
// Hack: we could have something like "rep # some comment" or
@@ -2496,6 +2583,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
while (Name.startswith(";") || Name.startswith("\n") ||
Name.startswith("#") || Name.startswith("\t") ||
Name.startswith("/")) {
+    // FIXME: The mnemonic won't match correctly if it's not in lower case.
Name = Parser.getTok().getString();
Parser.Lex(); // go to next prefix or instr
}
@@ -2519,6 +2607,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+ // Push the immediate if we extracted one from the mnemonic.
+ if (ComparisonPredicate != ~0U && !isParsingIntelSyntax()) {
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonPredicate,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+ }
+
// This does the actual operand parsing. Don't parse any more if we have a
// prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
// just want to parse the "lock" as the first instruction and the "incl" as
@@ -2553,6 +2648,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return TokError("unexpected token in argument list");
}
+ // Push the immediate if we extracted one from the mnemonic.
+ if (ComparisonPredicate != ~0U && isParsingIntelSyntax()) {
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonPredicate,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+ }
+
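(Sketch: in Intel syntax the extracted predicate is instead appended after the
parsed operands, where Intel operand order expects the immediate, e.g.
    vpcmpleq k1, zmm2, zmm3   is matched as   vpcmpq k1, zmm2, zmm3, 2)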
// Consume the EndOfStatement or the prefix separator Slash
if (getLexer().is(AsmToken::EndOfStatement) ||
(isPrefix && getLexer().is(AsmToken::Slash)))
@@ -2576,13 +2678,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl);
}
- // Moving a 32 or 16 bit value into a segment register has the same
- // behavior. Modify such instructions to always take shorter form.
if ((Name == "mov" || Name == "movw" || Name == "movl") &&
(Operands.size() == 3)) {
X86Operand &Op1 = (X86Operand &)*Operands[1];
X86Operand &Op2 = (X86Operand &)*Operands[2];
SMLoc Loc = Op1.getEndLoc();
+ // Moving a 32 or 16 bit value into a segment register has the same
+ // behavior. Modify such instructions to always take shorter form.
if (Op1.isReg() && Op2.isReg() &&
X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(
Op2.getReg()) &&
@@ -2759,7 +2861,69 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
- return false;
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+
+ switch (Inst.getOpcode()) {
+ default: return false;
+ case X86::VMOVZPQILo2PQIrr:
+ case X86::VMOVAPDrr:
+ case X86::VMOVAPDYrr:
+ case X86::VMOVAPSrr:
+ case X86::VMOVAPSYrr:
+ case X86::VMOVDQArr:
+ case X86::VMOVDQAYrr:
+ case X86::VMOVDQUrr:
+ case X86::VMOVDQUYrr:
+ case X86::VMOVUPDrr:
+ case X86::VMOVUPDYrr:
+ case X86::VMOVUPSrr:
+ case X86::VMOVUPSYrr: {
+ // We can get a smaller encoding by using VEX.R instead of VEX.B if one of
+    // the registers is extended, but the other isn't.
+ if (ForcedVEXEncoding == VEXEncoding_VEX3 ||
+ MRI->getEncodingValue(Inst.getOperand(0).getReg()) >= 8 ||
+ MRI->getEncodingValue(Inst.getOperand(1).getReg()) < 8)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ }
+ Inst.setOpcode(NewOpc);
+ return true;
+ }
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ // We can get a smaller encoding by using VEX.R instead of VEX.B if one of
+    // the registers is extended, but the other isn't.
+ if (ForcedVEXEncoding == VEXEncoding_VEX3 ||
+ MRI->getEncodingValue(Inst.getOperand(0).getReg()) >= 8 ||
+ MRI->getEncodingValue(Inst.getOperand(2).getReg()) < 8)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ }
+ Inst.setOpcode(NewOpc);
+ return true;
+ }
+ }
}
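(A sketch of the optimization above, assuming the default encoding selection: for
a reg-reg AVX move whose destination encoding is below 8 while the source is
xmm8-xmm15, switching to the _REV (store-form) opcode puts the extended register
in ModRM.reg, so VEX.R covers it and the 2-byte VEX prefix suffices, e.g.
    vmovaps %xmm8, %xmm0   # load form needs VEX.B -> 3-byte VEX
                           # _REV form: reg=%xmm8, rm=%xmm0 -> 2-byte VEX)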
bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
@@ -2865,9 +3029,7 @@ static const char *getSubtargetFeatureName(uint64_t Val);
void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
MCStreamer &Out) {
- Instrumentation->InstrumentAndEmitInstruction(
- Inst, Operands, getContext(), MII, Out,
- getParser().shouldPrintSchedInfo());
+ Out.EmitInstruction(Inst, getSTI());
}
bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -2907,17 +3069,16 @@ void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
}
}
-bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc,
+ const FeatureBitset &MissingFeatures,
bool MatchingInlineAsm) {
- assert(ErrorInfo && "Unknown missing feature!");
+ assert(MissingFeatures.any() && "Unknown missing feature!");
SmallString<126> Msg;
raw_svector_ostream OS(Msg);
OS << "instruction requires:";
- uint64_t Mask = 1;
- for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
- if (ErrorInfo & Mask)
- OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask);
- Mask <<= 1;
+ for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) {
+ if (MissingFeatures[i])
+ OS << ' ' << getSubtargetFeatureName(i);
}
return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm);
}
@@ -2932,30 +3093,70 @@ static unsigned getPrefixes(OperandVector &Operands) {
return Result;
}
+unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+ unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &MCID = MII.get(Opc);
+
+ if (ForcedVEXEncoding == VEXEncoding_EVEX &&
+ (MCID.TSFlags & X86II::EncodingMask) != X86II::EVEX)
+ return Match_Unsupported;
+
+ if ((ForcedVEXEncoding == VEXEncoding_VEX2 ||
+ ForcedVEXEncoding == VEXEncoding_VEX3) &&
+ (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
+ return Match_Unsupported;
+
+ // These instructions match ambiguously with their VEX encoded counterparts
+ // and appear first in the matching table. Reject them unless we're forcing
+ // EVEX encoding.
+ // FIXME: We really need a way to break the ambiguity.
+ switch (Opc) {
+ case X86::VCVTSD2SIZrm_Int:
+ case X86::VCVTSD2SI64Zrm_Int:
+ case X86::VCVTSS2SIZrm_Int:
+ case X86::VCVTSS2SI64Zrm_Int:
+ case X86::VCVTTSD2SIZrm: case X86::VCVTTSD2SIZrm_Int:
+ case X86::VCVTTSD2SI64Zrm: case X86::VCVTTSD2SI64Zrm_Int:
+ case X86::VCVTTSS2SIZrm: case X86::VCVTTSS2SIZrm_Int:
+ case X86::VCVTTSS2SI64Zrm: case X86::VCVTTSS2SI64Zrm_Int:
+ if (ForcedVEXEncoding != VEXEncoding_EVEX)
+ return Match_Unsupported;
+ }
+
+ return Match_Success;
+}
+
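(Illustration, assuming both VEX and EVEX variants exist: a bare
"vcvttss2si (%rax), %ecx" keeps matching the VEX form because the Z opcodes
listed above are rejected here, while "{evex} vcvttss2si (%rax), %ecx" now
selects the AVX-512 variant.)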
bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpect empty operand list!");
- X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
- assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+  assert((*Operands[0]).isToken() &&
+         "Leading operand should always be a mnemonic!");
SMRange EmptyRange = None;
// First, handle aliases that expand to multiple instructions.
- MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
-
- bool WasOriginallyInvalidOperand = false;
+ MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands,
+ Out, MatchingInlineAsm);
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
unsigned Prefixes = getPrefixes(Operands);
MCInst Inst;
+ // If VEX3 encoding is forced, we need to pass the USE_VEX3 flag to the
+ // encoder.
+ if (ForcedVEXEncoding == VEXEncoding_VEX3)
+ Prefixes |= X86::IP_USE_VEX3;
+
if (Prefixes)
Inst.setFlags(Prefixes);
// First, try a direct match.
- switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
- isParsingIntelSyntax())) {
+ FeatureBitset MissingFeatures;
+ unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo,
+ MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
+ switch (OriginalError) {
default: llvm_unreachable("Unexpected match result!");
case Match_Success:
if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
@@ -2973,13 +3174,17 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
Opcode = Inst.getOpcode();
return false;
case Match_MissingFeature:
- return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm);
+ return ErrorMissingFeature(IDLoc, MissingFeatures, MatchingInlineAsm);
case Match_InvalidOperand:
- WasOriginallyInvalidOperand = true;
- break;
case Match_MnemonicFail:
+ case Match_Unsupported:
break;
}
+ if (Op.getToken().empty()) {
+ Error(IDLoc, "instruction must have size higher than 0", EmptyRange,
+ MatchingInlineAsm);
+ return true;
+ }
// FIXME: Ideally, we would only attempt suffix matches for things which are
// valid prefixes, and we could just infer the right unambiguous
@@ -3003,16 +3208,17 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
// Check for the various suffix matches.
uint64_t ErrorInfoIgnore;
- uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
+ FeatureBitset ErrorInfoMissingFeatures; // Init suppresses compiler warnings.
unsigned Match[4];
for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
Tmp.back() = Suffixes[I];
Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
- MatchingInlineAsm, isParsingIntelSyntax());
+ MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match[I] == Match_MissingFeature)
- ErrorInfoMissingFeature = ErrorInfoIgnore;
+ ErrorInfoMissingFeatures = MissingFeatures;
}
// Restore the old token.
@@ -3062,11 +3268,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
// If all of the instructions reported an invalid mnemonic, then the original
// mnemonic was invalid.
if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) {
- if (!WasOriginallyInvalidOperand) {
+ if (OriginalError == Match_MnemonicFail)
return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
Op.getLocRange(), MatchingInlineAsm);
- }
+ if (OriginalError == Match_Unsupported)
+ return Error(IDLoc, "unsupported instruction", EmptyRange,
+ MatchingInlineAsm);
+
+ assert(OriginalError == Match_InvalidOperand && "Unexpected error");
// Recover location info for the operand if we know which was the problem.
if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size())
@@ -3085,12 +3295,19 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
MatchingInlineAsm);
}
+ // If one instruction matched as unsupported, report this as unsupported.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_Unsupported) == 1) {
+ return Error(IDLoc, "unsupported instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
// If one instruction matched with a missing feature, report this as a
// missing feature.
if (std::count(std::begin(Match), std::end(Match),
Match_MissingFeature) == 1) {
- ErrorInfo = ErrorInfoMissingFeature;
- return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+ ErrorInfo = Match_MissingFeature;
+ return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures,
MatchingInlineAsm);
}
@@ -3114,18 +3331,23 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpect empty operand list!");
- X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
- assert(Op.isToken() && "Leading operand should always be a mnemonic!");
- StringRef Mnemonic = Op.getToken();
+  assert((*Operands[0]).isToken() &&
+         "Leading operand should always be a mnemonic!");
+ StringRef Mnemonic = (static_cast<X86Operand &>(*Operands[0])).getToken();
SMRange EmptyRange = None;
- StringRef Base = Op.getToken();
+ StringRef Base = (static_cast<X86Operand &>(*Operands[0])).getToken();
unsigned Prefixes = getPrefixes(Operands);
// First, handle aliases that expand to multiple instructions.
- MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
+  MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands,
+                    Out, MatchingInlineAsm);
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
MCInst Inst;
+ // If VEX3 encoding is forced, we need to pass the USE_VEX3 flag to the
+ // encoder.
+ if (ForcedVEXEncoding == VEXEncoding_VEX3)
+ Prefixes |= X86::IP_USE_VEX3;
+
if (Prefixes)
Inst.setFlags(Prefixes);
@@ -3154,7 +3376,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
}
SmallVector<unsigned, 8> Match;
- uint64_t ErrorInfoMissingFeature = 0;
+ FeatureBitset ErrorInfoMissingFeatures;
+ FeatureBitset MissingFeatures;
// If unsized push has immediate operand we should default the default pointer
// size for the size.
@@ -3174,7 +3397,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
Op.setTokenValue(Tmp);
// Do match in ATT mode to allow explicit suffix usage.
Match.push_back(MatchInstruction(Operands, Inst, ErrorInfo,
- MatchingInlineAsm,
+ MissingFeatures, MatchingInlineAsm,
false /*isParsingIntelSyntax()*/));
Op.setTokenValue(Base);
}
@@ -3191,13 +3414,14 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
uint64_t ErrorInfoIgnore;
unsigned LastOpcode = Inst.getOpcode();
unsigned M = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
- MatchingInlineAsm, isParsingIntelSyntax());
+ MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
if (Match.empty() || LastOpcode != Inst.getOpcode())
Match.push_back(M);
// If this returned as a missing feature failure, remember that.
if (Match.back() == Match_MissingFeature)
- ErrorInfoMissingFeature = ErrorInfoIgnore;
+ ErrorInfoMissingFeatures = MissingFeatures;
}
// Restore the size of the unsized memory operand if we modified it.
@@ -3209,10 +3433,11 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// matching with the unsized operand.
if (Match.empty()) {
Match.push_back(MatchInstruction(
- Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax()));
+ Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax()));
// If this returned as a missing feature failure, remember that.
if (Match.back() == Match_MissingFeature)
- ErrorInfoMissingFeature = ErrorInfo;
+ ErrorInfoMissingFeatures = MissingFeatures;
}
// Restore the size of the unsized memory operand if we modified it.
@@ -3234,7 +3459,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
UnsizedMemOp->getMemFrontendSize()) {
UnsizedMemOp->Mem.Size = UnsizedMemOp->getMemFrontendSize();
unsigned M = MatchInstruction(
- Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax());
+ Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
if (M == Match_Success)
NumSuccessfulMatches = 1;
@@ -3270,12 +3496,19 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
UnsizedMemOp->getLocRange());
}
+ // If one instruction matched as unsupported, report this as unsupported.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_Unsupported) == 1) {
+ return Error(IDLoc, "unsupported instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
// If one instruction matched with a missing feature, report this as a
// missing feature.
if (std::count(std::begin(Match), std::end(Match),
Match_MissingFeature) == 1) {
- ErrorInfo = ErrorInfoMissingFeature;
- return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+ ErrorInfo = Match_MissingFeature;
+ return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures,
MatchingInlineAsm);
}
diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
index c45a3f14ef11..5bc979d1f18c 100644
--- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h
+++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -1,9 +1,8 @@
//===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 4d4aae0a1c6a..a771ba366318 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -1,16 +1,15 @@
//===- X86Operand.h - Parsed X86 machine instruction ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
-#include "InstPrinter/X86IntelInstPrinter.h"
+#include "MCTargetDesc/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86AsmParserCommon.h"
#include "llvm/ADT/STLExtras.h"
@@ -452,6 +451,31 @@ struct X86Operand final : public MCParsedAsmOperand {
X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
}
+ bool isVK1Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK1RegClassID].contains(getReg());
+ }
+
+ bool isVK2Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK2RegClassID].contains(getReg());
+ }
+
+ bool isVK4Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK4RegClassID].contains(getReg());
+ }
+
+ bool isVK8Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK8RegClassID].contains(getReg());
+ }
+
+ bool isVK16Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK16RegClassID].contains(getReg());
+ }
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediates when possible.
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
@@ -483,6 +507,30 @@ struct X86Operand final : public MCParsedAsmOperand {
addExpr(Inst, getImm());
}
+ void addMaskPairOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ unsigned Reg = getReg();
+ switch (Reg) {
+ case X86::K0:
+ case X86::K1:
+ Reg = X86::K0_K1;
+ break;
+ case X86::K2:
+ case X86::K3:
+ Reg = X86::K2_K3;
+ break;
+ case X86::K4:
+ case X86::K5:
+ Reg = X86::K4_K5;
+ break;
+ case X86::K6:
+ case X86::K7:
+ Reg = X86::K6_K7;
+ break;
+ }
+ Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
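(Sketch of the mapping above: either member of a pair selects the pair register,
    k0/k1 -> K0_K1,   k2/k3 -> K2_K3,   k4/k5 -> K4_K5,   k6/k7 -> K6_K7
presumably for instructions such as vp2intersectd that define a mask-register pair.)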
void addMemOperands(MCInst &Inst, unsigned N) const {
assert((N == 5) && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 62312777318e..9a635bbe5f85 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -1,9 +1,8 @@
//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -76,6 +75,7 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "TargetInfo/X86TargetInfo.h"
#include "X86DisassemblerDecoder.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -446,211 +446,6 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
case ENCODING_IO:
break;
}
- } else if (type == TYPE_IMM3) {
- // Check for immediates that printSSECC can't handle.
- if (immediate >= 8) {
- unsigned NewOpc;
- switch (mcInst.getOpcode()) {
- default: llvm_unreachable("unexpected opcode");
- case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break;
- case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break;
- case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break;
- case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break;
- case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break;
- case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break;
- case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break;
- case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break;
- case X86::VPCOMBri: NewOpc = X86::VPCOMBri_alt; break;
- case X86::VPCOMBmi: NewOpc = X86::VPCOMBmi_alt; break;
- case X86::VPCOMWri: NewOpc = X86::VPCOMWri_alt; break;
- case X86::VPCOMWmi: NewOpc = X86::VPCOMWmi_alt; break;
- case X86::VPCOMDri: NewOpc = X86::VPCOMDri_alt; break;
- case X86::VPCOMDmi: NewOpc = X86::VPCOMDmi_alt; break;
- case X86::VPCOMQri: NewOpc = X86::VPCOMQri_alt; break;
- case X86::VPCOMQmi: NewOpc = X86::VPCOMQmi_alt; break;
- case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break;
- case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break;
- case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break;
- case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break;
- case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break;
- case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break;
- case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break;
- case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break;
- }
- // Switch opcode to the one that doesn't get special printing.
- mcInst.setOpcode(NewOpc);
- }
- } else if (type == TYPE_IMM5) {
- // Check for immediates that printAVXCC can't handle.
- if (immediate >= 32) {
- unsigned NewOpc;
- switch (mcInst.getOpcode()) {
- default: llvm_unreachable("unexpected opcode");
- case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break;
- case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break;
- case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break;
- case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break;
- case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break;
- case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break;
- case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break;
- case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break;
- case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break;
- case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break;
- case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break;
- case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break;
- case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break;
- case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break;
- case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break;
- case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break;
- case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break;
- case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break;
- case X86::VCMPPDZ128rmi: NewOpc = X86::VCMPPDZ128rmi_alt; break;
- case X86::VCMPPDZ128rri: NewOpc = X86::VCMPPDZ128rri_alt; break;
- case X86::VCMPPSZ128rmi: NewOpc = X86::VCMPPSZ128rmi_alt; break;
- case X86::VCMPPSZ128rri: NewOpc = X86::VCMPPSZ128rri_alt; break;
- case X86::VCMPPDZ256rmi: NewOpc = X86::VCMPPDZ256rmi_alt; break;
- case X86::VCMPPDZ256rri: NewOpc = X86::VCMPPDZ256rri_alt; break;
- case X86::VCMPPSZ256rmi: NewOpc = X86::VCMPPSZ256rmi_alt; break;
- case X86::VCMPPSZ256rri: NewOpc = X86::VCMPPSZ256rri_alt; break;
- case X86::VCMPSDZrm_Int: NewOpc = X86::VCMPSDZrmi_alt; break;
- case X86::VCMPSDZrr_Int: NewOpc = X86::VCMPSDZrri_alt; break;
- case X86::VCMPSDZrrb_Int: NewOpc = X86::VCMPSDZrrb_alt; break;
- case X86::VCMPSSZrm_Int: NewOpc = X86::VCMPSSZrmi_alt; break;
- case X86::VCMPSSZrr_Int: NewOpc = X86::VCMPSSZrri_alt; break;
- case X86::VCMPSSZrrb_Int: NewOpc = X86::VCMPSSZrrb_alt; break;
- }
- // Switch opcode to the one that doesn't get special printing.
- mcInst.setOpcode(NewOpc);
- }
- } else if (type == TYPE_AVX512ICC) {
- if (immediate >= 8 || ((immediate & 0x3) == 3)) {
- unsigned NewOpc;
- switch (mcInst.getOpcode()) {
- default: llvm_unreachable("unexpected opcode");
- case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPBZ128rmi_alt; break;
- case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPBZ128rmik_alt; break;
- case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPBZ128rri_alt; break;
- case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPBZ128rrik_alt; break;
- case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPBZ256rmi_alt; break;
- case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPBZ256rmik_alt; break;
- case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPBZ256rri_alt; break;
- case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPBZ256rrik_alt; break;
- case X86::VPCMPBZrmi: NewOpc = X86::VPCMPBZrmi_alt; break;
- case X86::VPCMPBZrmik: NewOpc = X86::VPCMPBZrmik_alt; break;
- case X86::VPCMPBZrri: NewOpc = X86::VPCMPBZrri_alt; break;
- case X86::VPCMPBZrrik: NewOpc = X86::VPCMPBZrrik_alt; break;
- case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPDZ128rmi_alt; break;
- case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPDZ128rmib_alt; break;
- case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPDZ128rmibk_alt; break;
- case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPDZ128rmik_alt; break;
- case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPDZ128rri_alt; break;
- case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPDZ128rrik_alt; break;
- case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPDZ256rmi_alt; break;
- case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPDZ256rmib_alt; break;
- case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPDZ256rmibk_alt; break;
- case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPDZ256rmik_alt; break;
- case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPDZ256rri_alt; break;
- case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPDZ256rrik_alt; break;
- case X86::VPCMPDZrmi: NewOpc = X86::VPCMPDZrmi_alt; break;
- case X86::VPCMPDZrmib: NewOpc = X86::VPCMPDZrmib_alt; break;
- case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPDZrmibk_alt; break;
- case X86::VPCMPDZrmik: NewOpc = X86::VPCMPDZrmik_alt; break;
- case X86::VPCMPDZrri: NewOpc = X86::VPCMPDZrri_alt; break;
- case X86::VPCMPDZrrik: NewOpc = X86::VPCMPDZrrik_alt; break;
- case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPQZ128rmi_alt; break;
- case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPQZ128rmib_alt; break;
- case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPQZ128rmibk_alt; break;
- case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPQZ128rmik_alt; break;
- case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPQZ128rri_alt; break;
- case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPQZ128rrik_alt; break;
- case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPQZ256rmi_alt; break;
- case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPQZ256rmib_alt; break;
- case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPQZ256rmibk_alt; break;
- case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPQZ256rmik_alt; break;
- case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPQZ256rri_alt; break;
- case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPQZ256rrik_alt; break;
- case X86::VPCMPQZrmi: NewOpc = X86::VPCMPQZrmi_alt; break;
- case X86::VPCMPQZrmib: NewOpc = X86::VPCMPQZrmib_alt; break;
- case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPQZrmibk_alt; break;
- case X86::VPCMPQZrmik: NewOpc = X86::VPCMPQZrmik_alt; break;
- case X86::VPCMPQZrri: NewOpc = X86::VPCMPQZrri_alt; break;
- case X86::VPCMPQZrrik: NewOpc = X86::VPCMPQZrrik_alt; break;
- case X86::VPCMPUBZ128rmi: NewOpc = X86::VPCMPUBZ128rmi_alt; break;
- case X86::VPCMPUBZ128rmik: NewOpc = X86::VPCMPUBZ128rmik_alt; break;
- case X86::VPCMPUBZ128rri: NewOpc = X86::VPCMPUBZ128rri_alt; break;
- case X86::VPCMPUBZ128rrik: NewOpc = X86::VPCMPUBZ128rrik_alt; break;
- case X86::VPCMPUBZ256rmi: NewOpc = X86::VPCMPUBZ256rmi_alt; break;
- case X86::VPCMPUBZ256rmik: NewOpc = X86::VPCMPUBZ256rmik_alt; break;
- case X86::VPCMPUBZ256rri: NewOpc = X86::VPCMPUBZ256rri_alt; break;
- case X86::VPCMPUBZ256rrik: NewOpc = X86::VPCMPUBZ256rrik_alt; break;
- case X86::VPCMPUBZrmi: NewOpc = X86::VPCMPUBZrmi_alt; break;
- case X86::VPCMPUBZrmik: NewOpc = X86::VPCMPUBZrmik_alt; break;
- case X86::VPCMPUBZrri: NewOpc = X86::VPCMPUBZrri_alt; break;
- case X86::VPCMPUBZrrik: NewOpc = X86::VPCMPUBZrrik_alt; break;
- case X86::VPCMPUDZ128rmi: NewOpc = X86::VPCMPUDZ128rmi_alt; break;
- case X86::VPCMPUDZ128rmib: NewOpc = X86::VPCMPUDZ128rmib_alt; break;
- case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break;
- case X86::VPCMPUDZ128rmik: NewOpc = X86::VPCMPUDZ128rmik_alt; break;
- case X86::VPCMPUDZ128rri: NewOpc = X86::VPCMPUDZ128rri_alt; break;
- case X86::VPCMPUDZ128rrik: NewOpc = X86::VPCMPUDZ128rrik_alt; break;
- case X86::VPCMPUDZ256rmi: NewOpc = X86::VPCMPUDZ256rmi_alt; break;
- case X86::VPCMPUDZ256rmib: NewOpc = X86::VPCMPUDZ256rmib_alt; break;
- case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break;
- case X86::VPCMPUDZ256rmik: NewOpc = X86::VPCMPUDZ256rmik_alt; break;
- case X86::VPCMPUDZ256rri: NewOpc = X86::VPCMPUDZ256rri_alt; break;
- case X86::VPCMPUDZ256rrik: NewOpc = X86::VPCMPUDZ256rrik_alt; break;
- case X86::VPCMPUDZrmi: NewOpc = X86::VPCMPUDZrmi_alt; break;
- case X86::VPCMPUDZrmib: NewOpc = X86::VPCMPUDZrmib_alt; break;
- case X86::VPCMPUDZrmibk: NewOpc = X86::VPCMPUDZrmibk_alt; break;
- case X86::VPCMPUDZrmik: NewOpc = X86::VPCMPUDZrmik_alt; break;
- case X86::VPCMPUDZrri: NewOpc = X86::VPCMPUDZrri_alt; break;
- case X86::VPCMPUDZrrik: NewOpc = X86::VPCMPUDZrrik_alt; break;
- case X86::VPCMPUQZ128rmi: NewOpc = X86::VPCMPUQZ128rmi_alt; break;
- case X86::VPCMPUQZ128rmib: NewOpc = X86::VPCMPUQZ128rmib_alt; break;
- case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break;
- case X86::VPCMPUQZ128rmik: NewOpc = X86::VPCMPUQZ128rmik_alt; break;
- case X86::VPCMPUQZ128rri: NewOpc = X86::VPCMPUQZ128rri_alt; break;
- case X86::VPCMPUQZ128rrik: NewOpc = X86::VPCMPUQZ128rrik_alt; break;
- case X86::VPCMPUQZ256rmi: NewOpc = X86::VPCMPUQZ256rmi_alt; break;
- case X86::VPCMPUQZ256rmib: NewOpc = X86::VPCMPUQZ256rmib_alt; break;
- case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break;
- case X86::VPCMPUQZ256rmik: NewOpc = X86::VPCMPUQZ256rmik_alt; break;
- case X86::VPCMPUQZ256rri: NewOpc = X86::VPCMPUQZ256rri_alt; break;
- case X86::VPCMPUQZ256rrik: NewOpc = X86::VPCMPUQZ256rrik_alt; break;
- case X86::VPCMPUQZrmi: NewOpc = X86::VPCMPUQZrmi_alt; break;
- case X86::VPCMPUQZrmib: NewOpc = X86::VPCMPUQZrmib_alt; break;
- case X86::VPCMPUQZrmibk: NewOpc = X86::VPCMPUQZrmibk_alt; break;
- case X86::VPCMPUQZrmik: NewOpc = X86::VPCMPUQZrmik_alt; break;
- case X86::VPCMPUQZrri: NewOpc = X86::VPCMPUQZrri_alt; break;
- case X86::VPCMPUQZrrik: NewOpc = X86::VPCMPUQZrrik_alt; break;
- case X86::VPCMPUWZ128rmi: NewOpc = X86::VPCMPUWZ128rmi_alt; break;
- case X86::VPCMPUWZ128rmik: NewOpc = X86::VPCMPUWZ128rmik_alt; break;
- case X86::VPCMPUWZ128rri: NewOpc = X86::VPCMPUWZ128rri_alt; break;
- case X86::VPCMPUWZ128rrik: NewOpc = X86::VPCMPUWZ128rrik_alt; break;
- case X86::VPCMPUWZ256rmi: NewOpc = X86::VPCMPUWZ256rmi_alt; break;
- case X86::VPCMPUWZ256rmik: NewOpc = X86::VPCMPUWZ256rmik_alt; break;
- case X86::VPCMPUWZ256rri: NewOpc = X86::VPCMPUWZ256rri_alt; break;
- case X86::VPCMPUWZ256rrik: NewOpc = X86::VPCMPUWZ256rrik_alt; break;
- case X86::VPCMPUWZrmi: NewOpc = X86::VPCMPUWZrmi_alt; break;
- case X86::VPCMPUWZrmik: NewOpc = X86::VPCMPUWZrmik_alt; break;
- case X86::VPCMPUWZrri: NewOpc = X86::VPCMPUWZrri_alt; break;
- case X86::VPCMPUWZrrik: NewOpc = X86::VPCMPUWZrrik_alt; break;
- case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPWZ128rmi_alt; break;
- case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPWZ128rmik_alt; break;
- case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPWZ128rri_alt; break;
- case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPWZ128rrik_alt; break;
- case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPWZ256rmi_alt; break;
- case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPWZ256rmik_alt; break;
- case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPWZ256rri_alt; break;
- case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPWZ256rrik_alt; break;
- case X86::VPCMPWZrmi: NewOpc = X86::VPCMPWZrmi_alt; break;
- case X86::VPCMPWZrmik: NewOpc = X86::VPCMPWZrmik_alt; break;
- case X86::VPCMPWZrri: NewOpc = X86::VPCMPWZrri_alt; break;
- case X86::VPCMPWZrrik: NewOpc = X86::VPCMPWZrrik_alt; break;
- }
- // Switch opcode to the one that doesn't get special printing.
- mcInst.setOpcode(NewOpc);
- }
}
switch (type) {
@@ -899,6 +694,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_XMM:
case TYPE_YMM:
case TYPE_ZMM:
+ case TYPE_VK_PAIR:
case TYPE_VK:
case TYPE_DEBUGREG:
case TYPE_CONTROLREG:
@@ -987,6 +783,9 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
case ENCODING_Rv:
translateRegister(mcInst, insn.opcodeRegister);
return false;
+ case ENCODING_CC:
+ mcInst.addOperand(MCOperand::createImm(insn.immediates[1]));
+ return false;
case ENCODING_FP:
translateFPRegister(mcInst, insn.modRM & 7);
return false;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 54d550b60652..a241362a271d 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -1,9 +1,8 @@
//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -377,8 +376,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 ||
nextByte == 0xc6 || nextByte == 0xc7)) {
insn->xAcquireRelease = true;
- if (nextByte != 0x90) // PAUSE instruction support
- break;
+ break;
}
if (isREX(insn, nextByte)) {
uint8_t nnextByte;
@@ -884,7 +882,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
attrMask |= ATTR_EVEXK;
if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
- attrMask |= ATTR_EVEXL;
+ attrMask |= ATTR_VEXL;
if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
attrMask |= ATTR_EVEXL2;
} else if (insn->vectorExtensionType == TYPE_VEX_3B) {
@@ -1470,6 +1468,10 @@ static int readModRM(struct InternalInstruction* insn) {
if (index > 7) \
*valid = 0; \
return prefix##_K0 + index; \
+ case TYPE_VK_PAIR: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_K0_K1 + (index / 2); \
case TYPE_MM64: \
return prefix##_MM0 + (index & 0x7); \
case TYPE_SEGMENTREG: \
@@ -1847,6 +1849,9 @@ static int readOperands(struct InternalInstruction* insn) {
if (readOpcodeRegister(insn, 0))
return -1;
break;
+ case ENCODING_CC:
+ insn->immediates[1] = insn->opcode & 0xf;
+ break;
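(Illustration: for the condition-coded opcodes this handles, the condition sits in
the low nibble of the opcode byte, e.g. 0F 94 (sete) yields 0x4, which
translateOperand() then emits as the immediate condition-code operand.)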
case ENCODING_FP:
break;
case ENCODING_VVVV:
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 3b8a4f732eed..7c0a42c019e3 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -1,9 +1,8 @@
//===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -325,6 +324,12 @@ namespace X86Disassembler {
ENTRY(K6) \
ENTRY(K7)
+#define REGS_MASK_PAIRS \
+ ENTRY(K0_K1) \
+ ENTRY(K2_K3) \
+ ENTRY(K4_K5) \
+ ENTRY(K6_K7)
+
#define REGS_SEGMENT \
ENTRY(ES) \
ENTRY(CS) \
@@ -394,6 +399,7 @@ namespace X86Disassembler {
REGS_YMM \
REGS_ZMM \
REGS_MASKS \
+ REGS_MASK_PAIRS \
REGS_SEGMENT \
REGS_DEBUG \
REGS_CONTROL \
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
deleted file mode 100644
index 0e861d5ddbc9..000000000000
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file includes code for rendering MCInst instances as AT&T-style
-// assembly.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86ATTInstPrinter.h"
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "X86InstComments.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <cinttypes>
-#include <cstdint>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "asm-printer"
-
-// Include the auto-generated portion of the assembly writer.
-#define PRINT_ALIAS_INSTR
-#include "X86GenAsmWriter.inc"
-
-void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
- OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">");
-}
-
-void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
- StringRef Annot, const MCSubtargetInfo &STI) {
- // If verbose assembly is enabled, we can print some informative comments.
- if (CommentStream)
- HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII);
-
- printInstFlags(MI, OS);
-
- // Output CALLpcrel32 as "callq" in 64-bit mode.
- // In Intel annotation it's always emitted as "call".
- //
- // TODO: Probably this hack should be redesigned via InstAlias in
- // InstrInfo.td as soon as Requires clause is supported properly
- // for InstAlias.
- if (MI->getOpcode() == X86::CALLpcrel32 &&
- (STI.getFeatureBits()[X86::Mode64Bit])) {
- OS << "\tcallq\t";
- printPCRelImm(MI, 0, OS);
- }
- // data16 and data32 both have the same encoding of 0x66. While data32 is
- // valid only in 16 bit systems, data16 is valid in the rest.
- // There seems to be some lack of support of the Requires clause that causes
- // 0x66 to be interpreted as "data16" by the asm printer.
- // Thus we add an adjustment here in order to print the "right" instruction.
- else if (MI->getOpcode() == X86::DATA16_PREFIX &&
- STI.getFeatureBits()[X86::Mode16Bit]) {
- OS << "\tdata32";
- }
- // Try to print any aliases first.
- else if (!printAliasInstr(MI, OS))
- printInstruction(MI, OS);
-
- // Next always print the annotation.
- printAnnotation(OS, Annot);
-}
-
-void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isReg()) {
- printRegName(O, Op.getReg());
- } else if (Op.isImm()) {
- // Print immediates as signed values.
- int64_t Imm = Op.getImm();
- O << markup("<imm:") << '$' << formatImm(Imm) << markup(">");
-
- // TODO: This should be in a helper function in the base class, so it can
- // be used by other printers.
-
- // If there are no instruction-specific comments, add a comment clarifying
- // the hex value of the immediate operand when it isn't in the range
- // [-256,255].
- if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) {
- // Don't print unnecessary hex sign bits.
- if (Imm == (int16_t)(Imm))
- *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm);
- else if (Imm == (int32_t)(Imm))
- *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm);
- else
- *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
- }
- } else {
- assert(Op.isExpr() && "unknown operand kind in printOperand");
- O << markup("<imm:") << '$';
- Op.getExpr()->print(O, &MAI);
- O << markup(">");
- }
-}
-
-void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
- const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
- const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
-
- O << markup("<mem:");
-
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
-
- if (DispSpec.isImm()) {
- int64_t DispVal = DispSpec.getImm();
- if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
- O << formatImm(DispVal);
- } else {
- assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
- DispSpec.getExpr()->print(O, &MAI);
- }
-
- if (IndexReg.getReg() || BaseReg.getReg()) {
- O << '(';
- if (BaseReg.getReg())
- printOperand(MI, Op + X86::AddrBaseReg, O);
-
- if (IndexReg.getReg()) {
- O << ',';
- printOperand(MI, Op + X86::AddrIndexReg, O);
- unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
- if (ScaleVal != 1) {
- O << ',' << markup("<imm:") << ScaleVal // never printed in hex.
- << markup(">");
- }
- }
- O << ')';
- }
-
- O << markup(">");
-}
-
-void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- O << markup("<mem:");
-
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + 1, O);
-
- O << "(";
- printOperand(MI, Op, O);
- O << ")";
-
- O << markup(">");
-}
-
-void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- O << markup("<mem:");
-
- O << "%es:(";
- printOperand(MI, Op, O);
- O << ")";
-
- O << markup(">");
-}
-
-void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- const MCOperand &DispSpec = MI->getOperand(Op);
-
- O << markup("<mem:");
-
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + 1, O);
-
- if (DispSpec.isImm()) {
- O << formatImm(DispSpec.getImm());
- } else {
- assert(DispSpec.isExpr() && "non-immediate displacement?");
- DispSpec.getExpr()->print(O, &MAI);
- }
-
- O << markup(">");
-}
-
-void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- if (MI->getOperand(Op).isExpr())
- return printOperand(MI, Op, O);
-
- O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff)
- << markup(">");
-}
diff --git a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp b/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp
deleted file mode 100644
index 432cd47ae499..000000000000
--- a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file includes common code for rendering MCInst instances as Intel-style
-// and Intel-style assembly.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86InstPrinterCommon.h"
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Casting.h"
-#include <cstdint>
-#include <cassert>
-
-using namespace llvm;
-
-void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid ssecc/avxcc argument!");
- case 0: O << "eq"; break;
- case 1: O << "lt"; break;
- case 2: O << "le"; break;
- case 3: O << "unord"; break;
- case 4: O << "neq"; break;
- case 5: O << "nlt"; break;
- case 6: O << "nle"; break;
- case 7: O << "ord"; break;
- case 8: O << "eq_uq"; break;
- case 9: O << "nge"; break;
- case 0xa: O << "ngt"; break;
- case 0xb: O << "false"; break;
- case 0xc: O << "neq_oq"; break;
- case 0xd: O << "ge"; break;
- case 0xe: O << "gt"; break;
- case 0xf: O << "true"; break;
- case 0x10: O << "eq_os"; break;
- case 0x11: O << "lt_oq"; break;
- case 0x12: O << "le_oq"; break;
- case 0x13: O << "unord_s"; break;
- case 0x14: O << "neq_us"; break;
- case 0x15: O << "nlt_uq"; break;
- case 0x16: O << "nle_uq"; break;
- case 0x17: O << "ord_s"; break;
- case 0x18: O << "eq_us"; break;
- case 0x19: O << "nge_uq"; break;
- case 0x1a: O << "ngt_uq"; break;
- case 0x1b: O << "false_os"; break;
- case 0x1c: O << "neq_os"; break;
- case 0x1d: O << "ge_oq"; break;
- case 0x1e: O << "gt_oq"; break;
- case 0x1f: O << "true_us"; break;
- }
-}
-
-void X86InstPrinterCommon::printXOPCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid xopcc argument!");
- case 0: O << "lt"; break;
- case 1: O << "le"; break;
- case 2: O << "gt"; break;
- case 3: O << "ge"; break;
- case 4: O << "eq"; break;
- case 5: O << "neq"; break;
- case 6: O << "false"; break;
- case 7: O << "true"; break;
- }
-}
-
-void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
- switch (Imm) {
- case 0: O << "{rn-sae}"; break;
- case 1: O << "{rd-sae}"; break;
- case 2: O << "{ru-sae}"; break;
- case 3: O << "{rz-sae}"; break;
- }
-}
-
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value (e.g. for jumps and calls). In
-/// Intel-style these print slightly differently than normal immediates.
-/// for example, a $ is not emitted.
-void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isImm())
- O << formatImm(Op.getImm());
- else {
- assert(Op.isExpr() && "unknown pcrel immediate operand");
- // If a symbolic branch target was added as a constant expression then print
- // that address in hex.
- const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
- int64_t Address;
- if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
- O << formatHex((uint64_t)Address);
- } else {
- // Otherwise, just print the expression.
- Op.getExpr()->print(O, &MAI);
- }
- }
-}
-
-void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- if (MI->getOperand(OpNo).getReg()) {
- printOperand(MI, OpNo, O);
- O << ':';
- }
-}
-
-void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- uint64_t TSFlags = Desc.TSFlags;
- unsigned Flags = MI->getFlags();
-
- if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
- O << "\tlock\t";
-
- if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK))
- O << "\tnotrack\t";
-
- if (Flags & X86::IP_HAS_REPEAT_NE)
- O << "\trepne\t";
- else if (Flags & X86::IP_HAS_REPEAT)
- O << "\trep\t";
-}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
deleted file mode 100644
index 044b71564152..000000000000
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file includes code for rendering MCInst instances as Intel-style
-// assembly.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86IntelInstPrinter.h"
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "X86InstComments.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cassert>
-#include <cstdint>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "asm-printer"
-
-#include "X86GenAsmWriter1.inc"
-
-void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
- OS << getRegisterName(RegNo);
-}
-
-void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
- StringRef Annot,
- const MCSubtargetInfo &STI) {
- printInstFlags(MI, OS);
-
- // In 16-bit mode, print data16 as data32.
- if (MI->getOpcode() == X86::DATA16_PREFIX &&
- STI.getFeatureBits()[X86::Mode16Bit]) {
- OS << "\tdata32";
- } else
- printInstruction(MI, OS);
-
- // Next always print the annotation.
- printAnnotation(OS, Annot);
-
- // If verbose assembly is enabled, we can print some informative comments.
- if (CommentStream)
- EmitAnyX86InstComments(MI, *CommentStream, MII);
-}
-
-void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isReg()) {
- printRegName(O, Op.getReg());
- } else if (Op.isImm()) {
- O << formatImm((int64_t)Op.getImm());
- } else {
- assert(Op.isExpr() && "unknown operand kind in printOperand");
- O << "offset ";
- Op.getExpr()->print(O, &MAI);
- }
-}
-
-void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
- unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
- const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
- const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
-
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
-
- O << '[';
-
- bool NeedPlus = false;
- if (BaseReg.getReg()) {
- printOperand(MI, Op+X86::AddrBaseReg, O);
- NeedPlus = true;
- }
-
- if (IndexReg.getReg()) {
- if (NeedPlus) O << " + ";
- if (ScaleVal != 1)
- O << ScaleVal << '*';
- printOperand(MI, Op+X86::AddrIndexReg, O);
- NeedPlus = true;
- }
-
- if (!DispSpec.isImm()) {
- if (NeedPlus) O << " + ";
- assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
- DispSpec.getExpr()->print(O, &MAI);
- } else {
- int64_t DispVal = DispSpec.getImm();
- if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
- if (NeedPlus) {
- if (DispVal > 0)
- O << " + ";
- else {
- O << " - ";
- DispVal = -DispVal;
- }
- }
- O << formatImm(DispVal);
- }
- }
-
- O << ']';
-}
-
-void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + 1, O);
- O << '[';
- printOperand(MI, Op, O);
- O << ']';
-}
-
-void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- // DI accesses are always ES-based.
- O << "es:[";
- printOperand(MI, Op, O);
- O << ']';
-}
-
-void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- const MCOperand &DispSpec = MI->getOperand(Op);
-
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + 1, O);
-
- O << '[';
-
- if (DispSpec.isImm()) {
- O << formatImm(DispSpec.getImm());
- } else {
- assert(DispSpec.isExpr() && "non-immediate displacement?");
- DispSpec.getExpr()->print(O, &MAI);
- }
-
- O << ']';
-}
-
-void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- if (MI->getOperand(Op).isExpr())
- return MI->getOperand(Op).getExpr()->print(O, &MAI);
-
- O << formatImm(MI->getOperand(Op).getImm() & 0xff);
-}
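
For readers comparing the deleted Intel-syntax printer above with the AT&T printer added below, the key behavioral contract is the memory-operand layout. A minimal standalone sketch of the Intel-style formatting that the deleted printMemReference produced, using a hypothetical MemRef record in place of the MCOperand accessors (an illustration, not the LLVM API):

#include <cstdint>
#include <sstream>
#include <string>

// Hypothetical stand-in for the base/index/scale/disp operands pulled out of
// an MCInst; an empty string means "no register".
struct MemRef {
  std::string Seg, Base, Index;
  unsigned Scale = 1;
  int64_t Disp = 0;
};

// Intel syntax: "seg:[base + scale*index +/- disp]".
std::string formatIntelMem(const MemRef &M) {
  std::ostringstream O;
  if (!M.Seg.empty())
    O << M.Seg << ':';
  O << '[';
  bool NeedPlus = false;
  if (!M.Base.empty()) {
    O << M.Base;
    NeedPlus = true;
  }
  if (!M.Index.empty()) {
    if (NeedPlus)
      O << " + ";
    if (M.Scale != 1)
      O << M.Scale << '*';
    O << M.Index;
    NeedPlus = true;
  }
  // Like the deleted code, print the displacement only if it is non-zero or
  // if there is nothing else inside the brackets.
  if (M.Disp || !NeedPlus) {
    int64_t D = M.Disp;
    if (NeedPlus) {
      if (D < 0) {
        O << " - ";
        D = -D;
      } else {
        O << " + ";
      }
    }
    O << D;
  }
  O << ']';
  return O.str();
}

For example, {"fs", "rbx", "rcx", 4, -8} comes out as "fs:[rbx + 4*rcx - 8]".
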
diff --git a/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
new file mode 100644
index 000000000000..ed2ee55ff2a5
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -0,0 +1,487 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ATTInstPrinter.h"
+#include "X86BaseInfo.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">");
+}
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot, const MCSubtargetInfo &STI) {
+ // If verbose assembly is enabled, we can print some informative comments.
+ if (CommentStream)
+ HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII);
+
+ printInstFlags(MI, OS);
+
+ // Output CALLpcrel32 as "callq" in 64-bit mode.
+ // In Intel annotation it's always emitted as "call".
+ //
+ // TODO: Probably this hack should be redesigned via InstAlias in
+ // InstrInfo.td as soon as Requires clause is supported properly
+ // for InstAlias.
+ if (MI->getOpcode() == X86::CALLpcrel32 &&
+ (STI.getFeatureBits()[X86::Mode64Bit])) {
+ OS << "\tcallq\t";
+ printPCRelImm(MI, 0, OS);
+ }
+ // data16 and data32 both have the same encoding of 0x66. While data32 is
+ // valid only in 16-bit mode, data16 is valid everywhere else.
+ // There seems to be a lack of support for the Requires clause that causes
+ // 0x66 to be interpreted as "data16" by the asm printer.
+ // Thus we add an adjustment here in order to print the "right" instruction.
+ else if (MI->getOpcode() == X86::DATA16_PREFIX &&
+ STI.getFeatureBits()[X86::Mode16Bit]) {
+ OS << "\tdata32";
+ }
+ // Try to print any aliases first.
+ else if (!printAliasInstr(MI, OS) &&
+ !printVecCompareInstr(MI, OS))
+ printInstruction(MI, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+}
+
+bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
+ raw_ostream &OS) {
+ if (MI->getNumOperands() == 0 ||
+ !MI->getOperand(MI->getNumOperands() - 1).isImm())
+ return false;
+
+ int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+
+ // Custom print the vector compare instructions to get the immediate
+ // translated into the mnemonic.
+ switch (MI->getOpcode()) {
+ case X86::CMPPDrmi: case X86::CMPPDrri:
+ case X86::CMPPSrmi: case X86::CMPPSrri:
+ case X86::CMPSDrm: case X86::CMPSDrr:
+ case X86::CMPSDrm_Int: case X86::CMPSDrr_Int:
+ case X86::CMPSSrm: case X86::CMPSSrr:
+ case X86::CMPSSrm_Int: case X86::CMPSSrr_Int:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/false, OS);
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, 2, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, 2, OS);
+ else
+ printxmmwordmem(MI, 2, OS);
+ } else
+ printOperand(MI, 2, OS);
+
+ // Skip operand 1 as it's tied to the dest.
+
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ return true;
+ }
+ break;
+
+ case X86::VCMPPDrmi: case X86::VCMPPDrri:
+ case X86::VCMPPDYrmi: case X86::VCMPPDYrri:
+ case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri:
+ case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri:
+ case X86::VCMPPDZrmi: case X86::VCMPPDZrri:
+ case X86::VCMPPSrmi: case X86::VCMPPSrri:
+ case X86::VCMPPSYrmi: case X86::VCMPPSYrri:
+ case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri:
+ case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri:
+ case X86::VCMPPSZrmi: case X86::VCMPPSZrri:
+ case X86::VCMPSDrm: case X86::VCMPSDrr:
+ case X86::VCMPSDZrm: case X86::VCMPSDZrr:
+ case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int:
+ case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int:
+ case X86::VCMPSSrm: case X86::VCMPSSrr:
+ case X86::VCMPSSZrm: case X86::VCMPSSZrr:
+ case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int:
+ case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int:
+ case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik:
+ case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik:
+ case X86::VCMPPDZrmik: case X86::VCMPPDZrrik:
+ case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik:
+ case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik:
+ case X86::VCMPPSZrmik: case X86::VCMPPSZrrik:
+ case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk:
+ case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk:
+ case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik:
+ case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik:
+ case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik:
+ case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik:
+ case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik:
+ case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik:
+ case X86::VCMPPDZrrib: case X86::VCMPPDZrribk:
+ case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
+ case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
+ case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ if (Imm >= 0 && Imm <= 31) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/true, OS);
+
+ unsigned CurOp = (Desc.TSFlags & X86II::EVEX_K) ? 3 : 2;
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp--, OS);
+ else
+ printdwordmem(MI, CurOp--, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, CurOp--, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, CurOp--, OS);
+ else if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp--, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp--, OS);
+ else
+ printxmmwordmem(MI, CurOp--, OS);
+ }
+ } else {
+ if (Desc.TSFlags & X86II::EVEX_B)
+ OS << "{sae}, ";
+ printOperand(MI, CurOp--, OS);
+ }
+
+ OS << ", ";
+ printOperand(MI, CurOp--, OS);
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ if (CurOp > 0) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp--, OS);
+ OS << "}";
+ }
+
+ return true;
+ }
+ break;
+
+ case X86::VPCOMBmi: case X86::VPCOMBri:
+ case X86::VPCOMDmi: case X86::VPCOMDri:
+ case X86::VPCOMQmi: case X86::VPCOMQri:
+ case X86::VPCOMUBmi: case X86::VPCOMUBri:
+ case X86::VPCOMUDmi: case X86::VPCOMUDri:
+ case X86::VPCOMUQmi: case X86::VPCOMUQri:
+ case X86::VPCOMUWmi: case X86::VPCOMUWri:
+ case X86::VPCOMWmi: case X86::VPCOMWri:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printVPCOMMnemonic(MI, OS);
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem)
+ printxmmwordmem(MI, 2, OS);
+ else
+ printOperand(MI, 2, OS);
+
+ OS << ", ";
+ printOperand(MI, 1, OS);
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ return true;
+ }
+ break;
+
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrri:
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrri:
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrri:
+ case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri:
+ case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri:
+ case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri:
+ case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPUWZ256rmi: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri:
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrri:
+ case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmik: case X86::VPCMPBZrrik:
+ case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmik: case X86::VPCMPDZrrik:
+ case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmik: case X86::VPCMPQZrrik:
+ case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik:
+ case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPUWZ256rmik: case X86::VPCMPUWZ256rrik:
+ case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik:
+ case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmik: case X86::VPCMPWZrrik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
+ case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
+ case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk:
+ case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
+ case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
+ case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk:
+ if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) {
+ OS << '\t';
+ printVPCMPMnemonic(MI, OS);
+
+ unsigned CurOp = (Desc.TSFlags & X86II::EVEX_K) ? 3 : 2;
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit as only D and Q are supported.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp--, OS);
+ else
+ printdwordmem(MI, CurOp--, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp--, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp--, OS);
+ else
+ printxmmwordmem(MI, CurOp--, OS);
+ }
+ } else {
+ printOperand(MI, CurOp--, OS);
+ }
+
+ OS << ", ";
+ printOperand(MI, CurOp--, OS);
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ if (CurOp > 0) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp--, OS);
+ OS << "}";
+ }
+
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ printRegName(O, Op.getReg());
+ } else if (Op.isImm()) {
+ // Print immediates as signed values.
+ int64_t Imm = Op.getImm();
+ O << markup("<imm:") << '$' << formatImm(Imm) << markup(">");
+
+ // TODO: This should be in a helper function in the base class, so it can
+ // be used by other printers.
+
+ // If there are no instruction-specific comments, add a comment clarifying
+ // the hex value of the immediate operand when it isn't in the range
+ // [-256,255].
+ if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) {
+ // Don't print unnecessary hex sign bits.
+ if (Imm == (int16_t)(Imm))
+ *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm);
+ else if (Imm == (int32_t)(Imm))
+ *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm);
+ else
+ *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
+ }
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << markup("<imm:") << '$';
+ Op.getExpr()->print(O, &MAI);
+ O << markup(">");
+ }
+}
+
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
+ const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
+
+ if (DispSpec.isImm()) {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+ O << formatImm(DispVal);
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ if (IndexReg.getReg() || BaseReg.getReg()) {
+ O << '(';
+ if (BaseReg.getReg())
+ printOperand(MI, Op + X86::AddrBaseReg, O);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op + X86::AddrIndexReg, O);
+ unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
+ if (ScaleVal != 1) {
+ O << ',' << markup("<imm:") << ScaleVal // never printed in hex.
+ << markup(">");
+ }
+ }
+ O << ')';
+ }
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+
+ O << "(";
+ printOperand(MI, Op, O);
+ O << ")";
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ O << markup("<mem:");
+
+ O << "%es:(";
+ printOperand(MI, Op, O);
+ O << ")";
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &DispSpec = MI->getOperand(Op);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+
+ if (DispSpec.isImm()) {
+ O << formatImm(DispSpec.getImm());
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ if (MI->getOperand(Op).isExpr())
+ return printOperand(MI, Op, O);
+
+ O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff)
+ << markup(">");
+}
+
+void X86ATTInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ unsigned Reg = Op.getReg();
+ // Override the default printing to print st(0) instead of st.
+ if (Reg == X86::ST0)
+ OS << markup("<reg:") << "%st(0)" << markup(">");
+ else
+ printRegName(OS, Reg);
+}
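
The new AT&T printer above emits the same address components in the opposite order, displacement first and scale last, which is what its printMemReference produces once the markup() wrappers are stripped. The same hypothetical MemRef sketch, AT&T-flavored, for contrast with the Intel form shown earlier (again an illustration, not the LLVM API):

#include <cstdint>
#include <sstream>
#include <string>

struct MemRef {
  std::string Seg, Base, Index; // empty string means "no register"
  unsigned Scale = 1;
  int64_t Disp = 0;
};

// AT&T syntax: "%seg:disp(%base,%index,scale)".
std::string formatATTMem(const MemRef &M) {
  std::ostringstream O;
  if (!M.Seg.empty())
    O << '%' << M.Seg << ':';
  // The displacement is printed first, and only when non-zero or standing
  // alone with no registers.
  if (M.Disp || (M.Base.empty() && M.Index.empty()))
    O << M.Disp;
  if (!M.Base.empty() || !M.Index.empty()) {
    O << '(';
    if (!M.Base.empty())
      O << '%' << M.Base;
    if (!M.Index.empty()) {
      O << ",%" << M.Index;
      if (M.Scale != 1)
        O << ',' << M.Scale; // scale rides along as the third component
    }
    O << ')';
  }
  return O.str();
}

The fs:[rbx + 4*rcx - 8] example from before becomes %fs:-8(%rbx,%rcx,4).
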
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
index 57422bc9a0b2..747ddd30a2d9 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
@@ -1,9 +1,8 @@
//=- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax --*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
-#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
#include "X86InstPrinterCommon.h"
@@ -22,11 +21,12 @@ class X86ATTInstPrinter final : public X86InstPrinterCommon {
public:
X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
- : X86InstPrinterCommon(MAI, MII, MRI) {}
+ : X86InstPrinterCommon(MAI, MII, MRI), HasCustomInstComment(false) {}
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
const MCSubtargetInfo &STI) override;
+ bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS);
// Autogenerated by tblgen, returns true if we successfully printed an
// alias.
@@ -44,6 +44,7 @@ public:
void printSrcIdx(const MCInst *MI, unsigned Op, raw_ostream &O);
void printDstIdx(const MCInst *MI, unsigned Op, raw_ostream &O);
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
@@ -52,43 +53,28 @@ public:
printMemReference(MI, OpNo, O);
}
- void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printdwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printqwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printxmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printymmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printzmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printtbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
@@ -135,4 +121,4 @@ private:
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 64e6fb9f0375..54413fa1a02f 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -13,6 +12,7 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
@@ -26,18 +26,20 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-static unsigned getFixupKindLog2Size(unsigned Kind) {
+static unsigned getFixupKindSize(unsigned Kind) {
switch (Kind) {
default:
llvm_unreachable("invalid fixup kind!");
+ case FK_NONE:
+ return 0;
case FK_PCRel_1:
case FK_SecRel_1:
case FK_Data_1:
- return 0;
+ return 1;
case FK_PCRel_2:
case FK_SecRel_2:
case FK_Data_2:
- return 1;
+ return 2;
case FK_PCRel_4:
case X86::reloc_riprel_4byte:
case X86::reloc_riprel_4byte_relax:
@@ -49,12 +51,12 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
case X86::reloc_branch_4byte_pcrel:
case FK_SecRel_4:
case FK_Data_4:
- return 2;
+ return 4;
case FK_PCRel_8:
case FK_SecRel_8:
case FK_Data_8:
case X86::reloc_global_offset_table8:
- return 3;
+ return 8;
}
}
@@ -77,6 +79,8 @@ public:
return X86::NumTargetFixupKinds;
}
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
{"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
@@ -99,11 +103,14 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
+
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
const MCSubtargetInfo *STI) const override {
- unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+ unsigned Size = getFixupKindSize(Fixup.getKind());
assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
@@ -111,7 +118,7 @@ public:
// Specifically ignore overflow/underflow as long as the leakage is
// limited to the lower bits. This is to remain compatible with
// other assemblers.
- assert(isIntN(Size * 8 + 1, Value) &&
+ assert((Size == 0 || isIntN(Size * 8 + 1, Value)) &&
"Value does not fit in the Fixup field");
for (unsigned i = 0; i != Size; ++i)
@@ -137,40 +144,10 @@ static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) {
switch (Op) {
default:
return Op;
- case X86::JAE_1:
- return (is16BitMode) ? X86::JAE_2 : X86::JAE_4;
- case X86::JA_1:
- return (is16BitMode) ? X86::JA_2 : X86::JA_4;
- case X86::JBE_1:
- return (is16BitMode) ? X86::JBE_2 : X86::JBE_4;
- case X86::JB_1:
- return (is16BitMode) ? X86::JB_2 : X86::JB_4;
- case X86::JE_1:
- return (is16BitMode) ? X86::JE_2 : X86::JE_4;
- case X86::JGE_1:
- return (is16BitMode) ? X86::JGE_2 : X86::JGE_4;
- case X86::JG_1:
- return (is16BitMode) ? X86::JG_2 : X86::JG_4;
- case X86::JLE_1:
- return (is16BitMode) ? X86::JLE_2 : X86::JLE_4;
- case X86::JL_1:
- return (is16BitMode) ? X86::JL_2 : X86::JL_4;
+ case X86::JCC_1:
+ return (is16BitMode) ? X86::JCC_2 : X86::JCC_4;
case X86::JMP_1:
return (is16BitMode) ? X86::JMP_2 : X86::JMP_4;
- case X86::JNE_1:
- return (is16BitMode) ? X86::JNE_2 : X86::JNE_4;
- case X86::JNO_1:
- return (is16BitMode) ? X86::JNO_2 : X86::JNO_4;
- case X86::JNP_1:
- return (is16BitMode) ? X86::JNP_2 : X86::JNP_4;
- case X86::JNS_1:
- return (is16BitMode) ? X86::JNS_2 : X86::JNS_4;
- case X86::JO_1:
- return (is16BitMode) ? X86::JO_2 : X86::JO_4;
- case X86::JP_1:
- return (is16BitMode) ? X86::JP_2 : X86::JP_4;
- case X86::JS_1:
- return (is16BitMode) ? X86::JS_2 : X86::JS_4;
}
}
@@ -266,6 +243,25 @@ static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
return getRelaxedOpcodeBranch(Inst, is16BitMode);
}
+Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
+ if (STI.getTargetTriple().isOSBinFormatELF()) {
+ if (STI.getTargetTriple().getArch() == Triple::x86_64) {
+ if (Name == "R_X86_64_NONE")
+ return FK_NONE;
+ } else {
+ if (Name == "R_386_NONE")
+ return FK_NONE;
+ }
+ }
+ return MCAsmBackend::getFixupKind(Name);
+}
+
+bool X86AsmBackend::shouldForceRelocation(const MCAssembler &,
+ const MCFixup &Fixup,
+ const MCValue &) {
+ return Fixup.getKind() == FK_NONE;
+}
+
bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst,
const MCSubtargetInfo &STI) const {
// Branches can always be relaxed in either mode.
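
The getFixupKindSize change above replaces a log2 encoding with plain byte counts so that the new FK_NONE kind can report a size of zero, something "1 << log2" cannot express. A compilable sketch of the idea, modeling only the generic fixup kinds rather than the LLVM enum itself:

#include <cstdlib>

enum FixupKind { FK_NONE, FK_Data_1, FK_Data_2, FK_Data_4, FK_Data_8 };

unsigned getFixupKindSize(FixupKind Kind) {
  switch (Kind) {
  case FK_NONE:   return 0; // nothing is patched for a "none" relocation
  case FK_Data_1: return 1;
  case FK_Data_2: return 2;
  case FK_Data_4: return 4;
  case FK_Data_8: return 8;
  }
  std::abort(); // unreachable: every enumerator is handled above
}

// Callers that used to write "1u << getFixupKindLog2Size(Kind)" now take the
// byte count directly, e.g. for the bounds check in applyFixup:
//   unsigned Size = getFixupKindSize(Kind);
//   assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
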
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index c85ce9bbd5a4..6bd6c6cac7df 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -1,9 +1,8 @@
//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -49,7 +48,8 @@ namespace X86 {
TO_NEG_INF = 1,
TO_POS_INF = 2,
TO_ZERO = 3,
- CUR_DIRECTION = 4
+ CUR_DIRECTION = 4,
+ NO_EXC = 8
};
/// The constants to describe instr prefixes if there are
@@ -60,9 +60,46 @@ namespace X86 {
IP_HAS_REPEAT_NE = 4,
IP_HAS_REPEAT = 8,
IP_HAS_LOCK = 16,
- NO_SCHED_INFO = 32, // Don't add sched comment to the current instr because
- // it was already added
- IP_HAS_NOTRACK = 64
+ IP_HAS_NOTRACK = 32,
+ IP_USE_VEX3 = 64,
+ };
+
+ enum OperandType : unsigned {
+ /// AVX512 embedded rounding control. This should only have values 0-3.
+ OPERAND_ROUNDING_CONTROL = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_COND_CODE,
+ };
+
+ // X86 specific condition code. These correspond to X86_*_COND in
+ // X86InstrInfo.td. They must be kept in synch.
+ enum CondCode {
+ COND_O = 0,
+ COND_NO = 1,
+ COND_B = 2,
+ COND_AE = 3,
+ COND_E = 4,
+ COND_NE = 5,
+ COND_BE = 6,
+ COND_A = 7,
+ COND_S = 8,
+ COND_NS = 9,
+ COND_P = 10,
+ COND_NP = 11,
+ COND_L = 12,
+ COND_GE = 13,
+ COND_LE = 14,
+ COND_G = 15,
+ LAST_VALID_COND = COND_G,
+
+ // Artificial condition codes. These are used by AnalyzeBranch
+ // to indicate a block terminated with two conditional branches that together
+ // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
+ // which can't be represented on x86 with a single condition. These
+ // are never used in MachineInstrs and are inverses of one another.
+ COND_NE_OR_P,
+ COND_E_AND_NP,
+
+ COND_INVALID
};
} // end namespace X86;
@@ -285,6 +322,10 @@ namespace X86II {
/// manual, this operand is described as pntr16:32 and pntr16:16
RawFrmImm16 = 8,
+ /// AddCCFrm - This form is used for Jcc instructions that encode the
+ /// condition code in the lower 4 bits of the opcode.
+ AddCCFrm = 9,
+
/// MRM[0-7][rm] - These forms are used to represent instructions that use
/// a Mod/RM byte, and use the middle field to hold extended opcode
/// information. In the intel manual these are represented as /0, /1, ...
@@ -310,10 +351,21 @@ namespace X86II {
///
MRMSrcMemOp4 = 35,
+ /// MRMSrcMemCC - This form is used for instructions that use the Mod/RM
+ /// byte to specify the operands and also encodes a condition code.
+ ///
+ MRMSrcMemCC = 36,
+
+ /// MRMXmCC - This form is used for instructions that use the Mod/RM byte
+ /// to specify a memory source, but doesn't use the middle field, and has
+ /// a condition code.
+ ///
+ MRMXmCC = 38,
+
/// MRMXm - This form is used for instructions that use the Mod/RM byte
/// to specify a memory source, but doesn't use the middle field.
///
- MRMXm = 39, // Instruction that uses Mod/RM but not the middle field.
+ MRMXm = 39,
// Next, instructions that operate on a memory r/m operand...
MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3
@@ -339,10 +391,21 @@ namespace X86II {
///
MRMSrcRegOp4 = 51,
+ /// MRMSrcRegCC - This form is used for instructions that use the Mod/RM
+ /// byte to specify the operands and also encodes a condition code
+ ///
+ MRMSrcRegCC = 52,
+
+ /// MRMXrCC - This form is used for instructions that use the Mod/RM byte
+ /// to specify a register source, but doesn't use the middle field, and has
+ /// a condition code.
+ ///
+ MRMXrCC = 54,
+
/// MRMXr - This form is used for instructions that use the Mod/RM byte
/// to specify a register source, but doesn't use the middle field.
///
- MRMXr = 55, // Instruction that uses Mod/RM but not the middle field.
+ MRMXr = 55,
// Instructions that operate on a register r/m operand...
MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3
@@ -681,8 +744,7 @@ namespace X86II {
// has it as the last op.
if (NumOps == 9 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
(Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1 ||
- Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1) &&
- "Instruction with 2 defs isn't gather?")
+ Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1))
return 2;
return 0;
}
@@ -711,6 +773,7 @@ namespace X86II {
case X86II::RawFrmSrc:
case X86II::RawFrmDst:
case X86II::RawFrmDstSrc:
+ case X86II::AddCCFrm:
return -1;
case X86II::MRMDestMem:
return 0;
@@ -724,16 +787,23 @@ namespace X86II {
case X86II::MRMSrcMemOp4:
// Skip registers encoded in reg, VEX_VVVV, and I8IMM.
return 3;
+ case X86II::MRMSrcMemCC:
+ // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
+ // mask register.
+ return 1;
case X86II::MRMDestReg:
case X86II::MRMSrcReg:
case X86II::MRMSrcReg4VOp3:
case X86II::MRMSrcRegOp4:
+ case X86II::MRMSrcRegCC:
+ case X86II::MRMXrCC:
case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
case X86II::MRM6r: case X86II::MRM7r:
return -1;
+ case X86II::MRMXmCC:
case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
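
To make the new AddCCFrm/CondCode machinery above concrete: the condition code is carried as an operand and, per the AddCCFrm comment, lands in the low four bits of the opcode byte, which is how the merged JCC_1 form still reaches the classic 0x70-0x7F short-jump opcodes. A small hypothetical helper (not an LLVM function), using a few of the CondCode values defined above:

#include <cstdint>

// Subset of the X86::CondCode values introduced above.
enum CondCode : unsigned { COND_O = 0, COND_B = 2, COND_E = 4, COND_NE = 5, COND_L = 12 };

// Short conditional jump (rel8): base opcode 0x70 plus the condition code.
uint8_t encodeShortJcc(CondCode CC) {
  return static_cast<uint8_t>(0x70u | CC); // COND_E -> 0x74, i.e. "je rel8"
}
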
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index b724a89f81d2..232a06593238 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -45,7 +44,7 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
(EMachine != ELF::EM_386) &&
(EMachine != ELF::EM_IAMCU)) {}
-enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
+enum X86_64RelType { RT64_NONE, RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
static X86_64RelType getType64(unsigned Kind,
MCSymbolRefExpr::VariantKind &Modifier,
@@ -53,6 +52,8 @@ static X86_64RelType getType64(unsigned Kind,
switch (Kind) {
default:
llvm_unreachable("Unimplemented");
+ case FK_NONE:
+ return RT64_NONE;
case X86::reloc_global_offset_table8:
Modifier = MCSymbolRefExpr::VK_GOT;
IsPCRel = true;
@@ -103,6 +104,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case MCSymbolRefExpr::VK_None:
case MCSymbolRefExpr::VK_X86_ABS8:
switch (Type) {
+ case RT64_NONE:
+ if (Modifier == MCSymbolRefExpr::VK_None)
+ return ELF::R_X86_64_NONE;
+ llvm_unreachable("Unimplemented");
case RT64_64:
return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
case RT64_32:
@@ -114,6 +119,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case RT64_8:
return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8;
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_GOT:
switch (Type) {
case RT64_64:
@@ -123,8 +129,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case RT64_32S:
case RT64_16:
case RT64_8:
+ case RT64_NONE:
llvm_unreachable("Unimplemented");
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_GOTOFF:
assert(Type == RT64_64);
assert(!IsPCRel);
@@ -139,8 +147,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case RT64_32S:
case RT64_16:
case RT64_8:
+ case RT64_NONE:
llvm_unreachable("Unimplemented");
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_DTPOFF:
assert(!IsPCRel);
switch (Type) {
@@ -151,8 +161,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case RT64_32S:
case RT64_16:
case RT64_8:
+ case RT64_NONE:
llvm_unreachable("Unimplemented");
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_SIZE:
assert(!IsPCRel);
switch (Type) {
@@ -163,8 +175,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case RT64_32S:
case RT64_16:
case RT64_8:
+ case RT64_NONE:
llvm_unreachable("Unimplemented");
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_TLSCALL:
return ELF::R_X86_64_TLSDESC_CALL;
case MCSymbolRefExpr::VK_TLSDESC:
@@ -197,13 +211,16 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case X86::reloc_riprel_4byte_movq_load:
return ELF::R_X86_64_REX_GOTPCRELX;
}
+ llvm_unreachable("unexpected relocation type!");
}
}
-enum X86_32RelType { RT32_32, RT32_16, RT32_8 };
+enum X86_32RelType { RT32_NONE, RT32_32, RT32_16, RT32_8 };
static X86_32RelType getType32(X86_64RelType T) {
switch (T) {
+ case RT64_NONE:
+ return RT32_NONE;
case RT64_64:
llvm_unreachable("Unimplemented");
case RT64_32:
@@ -227,6 +244,10 @@ static unsigned getRelocType32(MCContext &Ctx,
case MCSymbolRefExpr::VK_None:
case MCSymbolRefExpr::VK_X86_ABS8:
switch (Type) {
+ case RT32_NONE:
+ if (Modifier == MCSymbolRefExpr::VK_None)
+ return ELF::R_386_NONE;
+ llvm_unreachable("Unimplemented");
case RT32_32:
return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32;
case RT32_16:
@@ -234,6 +255,7 @@ static unsigned getRelocType32(MCContext &Ctx,
case RT32_8:
return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8;
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_GOT:
assert(Type == RT32_32);
if (IsPCRel)
@@ -249,6 +271,10 @@ static unsigned getRelocType32(MCContext &Ctx,
assert(Type == RT32_32);
assert(!IsPCRel);
return ELF::R_386_GOTOFF;
+ case MCSymbolRefExpr::VK_TLSCALL:
+ return ELF::R_386_TLS_DESC_CALL;
+ case MCSymbolRefExpr::VK_TLSDESC:
+ return ELF::R_386_TLS_GOTDESC;
case MCSymbolRefExpr::VK_TPOFF:
assert(Type == RT32_32);
assert(!IsPCRel);
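
Threading the FK_NONE additions together, the flow is: a named null relocation is recognized by the backend, forced to survive to relocation emission, and then resolved to the ELF null relocation (type 0 on both ABIs). A standalone sketch with hypothetical names standing in for the MC classes:

#include <cstdint>
#include <optional>
#include <string>

enum Fixup { FK_NONE, FK_Data_4 };

// Mirrors getFixupKind: only the ELF "NONE" relocation names map to FK_NONE.
std::optional<Fixup> fixupKindFromName(const std::string &Name, bool Is64Bit) {
  if (Name == (Is64Bit ? "R_X86_64_NONE" : "R_386_NONE"))
    return FK_NONE;
  return std::nullopt; // defer to the generic lookup
}

// Mirrors shouldForceRelocation: an FK_NONE fixup always becomes a relocation.
bool shouldForceRelocation(Fixup F) { return F == FK_NONE; }

// Mirrors the RT64_NONE / RT32_NONE handling: both null relocations are 0.
uint32_t relocTypeForNone() { return 0; /* R_X86_64_NONE, R_386_NONE */ }
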
diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
index 3c04b13e002e..2d5217115d07 100644
--- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
+++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -1,9 +1,8 @@
//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 37bed37b0994..73b1969b4e82 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -1,9 +1,8 @@
//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,8 +13,8 @@
#include "X86InstComments.h"
#include "X86ATTInstPrinter.h"
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86BaseInfo.h"
+#include "X86MCTargetDesc.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -1076,9 +1075,12 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(1).getReg());
LLVM_FALLTHROUGH;
+ case X86::MOVSDrm_alt:
case X86::MOVSDrm:
+ case X86::VMOVSDrm_alt:
case X86::VMOVSDrm:
case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
DecodeScalarMoveMask(2, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1091,8 +1093,11 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
DecodeScalarMoveMask(4, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1203,7 +1208,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXBW, m)
- DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), ShuffleMask);
+ DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1211,7 +1217,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXBD, m)
- DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+ DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1219,7 +1226,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXBQ, m)
- DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
+ DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1227,7 +1235,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXWD, m)
- DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+ DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1235,7 +1244,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXWQ, m)
- DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
+ DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1243,7 +1253,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXDQ, m)
- DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
+ DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
}
@@ -1304,6 +1315,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
OS << ']';
--i; // For loop increments element #.
}
+ OS << '\n';
// We successfully added a comment to this instruction.
return true;
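
The DecodeZeroExtendMask calls above gain an extra boolean, passed as false here; it appears to distinguish a plain zero-extend (high lanes forced to zero) from an any-extend variant. For orientation, a standalone sketch of what such a zero-extend shuffle mask looks like when viewed at source-element granularity (the sentinel constant is a stand-in, not the LLVM one):

#include <vector>

// Stand-in for LLVM's "this lane is known zero" shuffle-mask marker.
constexpr int SentinelZero = -2;

// Zero-extending SrcBits-wide elements to DstBits-wide elements keeps source
// element i and pads it with zero lanes, e.g. 8->16 over 8 elements gives
// {0, Z, 1, Z, 2, Z, 3, Z, ...}.
std::vector<int> zeroExtendMask(unsigned SrcBits, unsigned DstBits,
                                unsigned NumDstElts) {
  std::vector<int> Mask;
  unsigned Scale = DstBits / SrcBits;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    Mask.push_back(static_cast<int>(i));
    for (unsigned j = 1; j != Scale; ++j)
      Mask.push_back(SentinelZero);
  }
  return Mask;
}
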
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.h b/lib/Target/X86/MCTargetDesc/X86InstComments.h
index 40dffa5fbb8a..96760664012a 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.h
+++ b/lib/Target/X86/MCTargetDesc/X86InstComments.h
@@ -1,9 +1,8 @@
//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
-#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTCOMMENTS_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTCOMMENTS_H
namespace llvm {
diff --git a/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
new file mode 100644
index 000000000000..a21555076976
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -0,0 +1,362 @@
+//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes common code for rendering MCInst instances as AT&T-style
+// and Intel-style assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstPrinterCommon.h"
+#include "X86BaseInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Casting.h"
+#include <cstdint>
+#include <cassert>
+
+using namespace llvm;
+
+void X86InstPrinterCommon::printCondCode(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid condcode argument!");
+ case 0: O << "o"; break;
+ case 1: O << "no"; break;
+ case 2: O << "b"; break;
+ case 3: O << "ae"; break;
+ case 4: O << "e"; break;
+ case 5: O << "ne"; break;
+ case 6: O << "be"; break;
+ case 7: O << "a"; break;
+ case 8: O << "s"; break;
+ case 9: O << "ns"; break;
+ case 0xa: O << "p"; break;
+ case 0xb: O << "np"; break;
+ case 0xc: O << "l"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "le"; break;
+ case 0xf: O << "g"; break;
+ }
+}
+
+void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid ssecc/avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
+ }
+}
+
+void X86InstPrinterCommon::printVPCOMMnemonic(const MCInst *MI,
+ raw_ostream &OS) {
+ OS << "vpcom";
+
+ int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid vpcom argument!");
+ case 0: OS << "lt"; break;
+ case 1: OS << "le"; break;
+ case 2: OS << "gt"; break;
+ case 3: OS << "ge"; break;
+ case 4: OS << "eq"; break;
+ case 5: OS << "neq"; break;
+ case 6: OS << "false"; break;
+ case 7: OS << "true"; break;
+ }
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::VPCOMBmi: case X86::VPCOMBri: OS << "b\t"; break;
+ case X86::VPCOMDmi: case X86::VPCOMDri: OS << "d\t"; break;
+ case X86::VPCOMQmi: case X86::VPCOMQri: OS << "q\t"; break;
+ case X86::VPCOMUBmi: case X86::VPCOMUBri: OS << "ub\t"; break;
+ case X86::VPCOMUDmi: case X86::VPCOMUDri: OS << "ud\t"; break;
+ case X86::VPCOMUQmi: case X86::VPCOMUQri: OS << "uq\t"; break;
+ case X86::VPCOMUWmi: case X86::VPCOMUWri: OS << "uw\t"; break;
+ case X86::VPCOMWmi: case X86::VPCOMWri: OS << "w\t"; break;
+ }
+}
+
+void X86InstPrinterCommon::printVPCMPMnemonic(const MCInst *MI,
+ raw_ostream &OS) {
+ OS << "vpcmp";
+
+ printSSEAVXCC(MI, MI->getNumOperands() - 1, OS);
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrri:
+ case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmik: case X86::VPCMPBZrrik:
+ OS << "b\t";
+ break;
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrri:
+ case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmik: case X86::VPCMPDZrrik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ OS << "d\t";
+ break;
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrri:
+ case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmik: case X86::VPCMPQZrrik:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ OS << "q\t";
+ break;
+ case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri:
+ case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik:
+ OS << "ub\t";
+ break;
+ case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri:
+ case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
+ case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
+ case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk:
+ OS << "ud\t";
+ break;
+ case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri:
+ case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
+ case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
+ case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk:
+ OS << "uq\t";
+ break;
+ case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPUWZ256rri: case X86::VPCMPUWZ256rmi:
+ case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri:
+ case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPUWZ256rrik: case X86::VPCMPUWZ256rmik:
+ case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik:
+ OS << "uw\t";
+ break;
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrri:
+ case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmik: case X86::VPCMPWZrrik:
+ OS << "w\t";
+ break;
+ }
+}
+
+void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp,
+ raw_ostream &OS) {
+ OS << (IsVCmp ? "vcmp" : "cmp");
+
+ printSSEAVXCC(MI, MI->getNumOperands() - 1, OS);
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::CMPPDrmi: case X86::CMPPDrri:
+ case X86::VCMPPDrmi: case X86::VCMPPDrri:
+ case X86::VCMPPDYrmi: case X86::VCMPPDYrri:
+ case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri:
+ case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri:
+ case X86::VCMPPDZrmi: case X86::VCMPPDZrri:
+ case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik:
+ case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik:
+ case X86::VCMPPDZrmik: case X86::VCMPPDZrrik:
+ case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik:
+ case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik:
+ case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik:
+ case X86::VCMPPDZrrib: case X86::VCMPPDZrribk:
+ OS << "pd\t";
+ break;
+ case X86::CMPPSrmi: case X86::CMPPSrri:
+ case X86::VCMPPSrmi: case X86::VCMPPSrri:
+ case X86::VCMPPSYrmi: case X86::VCMPPSYrri:
+ case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri:
+ case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri:
+ case X86::VCMPPSZrmi: case X86::VCMPPSZrri:
+ case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik:
+ case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik:
+ case X86::VCMPPSZrmik: case X86::VCMPPSZrrik:
+ case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik:
+ case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik:
+ case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik:
+ case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
+ OS << "ps\t";
+ break;
+ case X86::CMPSDrm: case X86::CMPSDrr:
+ case X86::CMPSDrm_Int: case X86::CMPSDrr_Int:
+ case X86::VCMPSDrm: case X86::VCMPSDrr:
+ case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int:
+ case X86::VCMPSDZrm: case X86::VCMPSDZrr:
+ case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int:
+ case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk:
+ case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
+ OS << "sd\t";
+ break;
+ case X86::CMPSSrm: case X86::CMPSSrr:
+ case X86::CMPSSrm_Int: case X86::CMPSSrr_Int:
+ case X86::VCMPSSrm: case X86::VCMPSSrr:
+ case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int:
+ case X86::VCMPSSZrm: case X86::VCMPSSZrr:
+ case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int:
+ case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk:
+ case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ OS << "ss\t";
+ break;
+ }
+}
+
+void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default:
+ llvm_unreachable("Invalid rounding control!");
+ case X86::TO_NEAREST_INT:
+ O << "{rn-sae}";
+ break;
+ case X86::TO_NEG_INF:
+ O << "{rd-sae}";
+ break;
+ case X86::TO_POS_INF:
+ O << "{ru-sae}";
+ break;
+ case X86::TO_ZERO:
+ O << "{rz-sae}";
+ break;
+ }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls). These
+/// print slightly differently from normal immediates; for example, no '$'
+/// prefix is emitted in AT&T syntax.
+void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ O << formatImm(Op.getImm());
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ // If a symbolic branch target was added as a constant expression then print
+ // that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+ O << formatHex((uint64_t)Address);
+ } else {
+ // Otherwise, just print the expression.
+ Op.getExpr()->print(O, &MAI);
+ }
+ }
+}
+
+void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getReg()) {
+ printOperand(MI, OpNo, O);
+ O << ':';
+ }
+}
+
+void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+ unsigned Flags = MI->getFlags();
+
+ if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
+ O << "\tlock\t";
+
+ if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK))
+ O << "\tnotrack\t";
+
+ if (Flags & X86::IP_HAS_REPEAT_NE)
+ O << "\trepne\t";
+ else if (Flags & X86::IP_HAS_REPEAT)
+ O << "\trep\t";
+}
+
+void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ // In assembly listings, a pair is represented by one of its members, any
+ // of the two. Here, we pick k0, k2, k4, k6, but we could just as well
+ // print K2_K3 as "k3". It would probably make more sense if the
+ // assembly looked something like:
+ // "vp2intersect %zmm5, %zmm7, {%k2, %k3}"
+ // but this can work too.
+ switch (MI->getOperand(OpNo).getReg()) {
+ case X86::K0_K1:
+ printRegName(OS, X86::K0);
+ return;
+ case X86::K2_K3:
+ printRegName(OS, X86::K2);
+ return;
+ case X86::K4_K5:
+ printRegName(OS, X86::K4);
+ return;
+ case X86::K6_K7:
+ printRegName(OS, X86::K6);
+ return;
+ }
+ llvm_unreachable("Unknown mask pair register name");
+}
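
For illustration, with this convention a mask-pair operand such as K2_K3 is listed as just "k2" and the odd member is implied. A tiny standalone sketch of the implied expansion (hypothetical helper, not LLVM API, not part of the patch):

    #include <cstdio>

    // Expands the even mask register that stands in for a pair back to the
    // full pair, e.g. 2 -> "{k2, k3}".
    static void printImpliedPair(unsigned EvenK) {
      std::printf("{k%u, k%u}\n", EvenK, EvenK + 1);
    }
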
diff --git a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
index f2875e71f22c..8e28f24b619a 100644
--- a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h
+++ b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -1,9 +1,8 @@
//===-- X86InstPrinterCommon.cpp - X86 assembly instruction printing ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H
-#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
#include "llvm/MC/MCInstPrinter.h"
@@ -24,15 +23,19 @@ public:
using MCInstPrinter::MCInstPrinter;
virtual void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) = 0;
+ void printCondCode(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
- void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printVPCOMMnemonic(const MCInst *MI, raw_ostream &OS);
+ void printVPCMPMnemonic(const MCInst *MI, raw_ostream &OS);
+ void printCMPMnemonic(const MCInst *MI, bool IsVCmp, raw_ostream &OS);
void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O);
void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
protected:
void printInstFlags(const MCInst *MI, raw_ostream &O);
void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
};
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
diff --git a/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
new file mode 100644
index 000000000000..ea28bef42569
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -0,0 +1,445 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86IntelInstPrinter.h"
+#include "X86BaseInfo.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter1.inc"
+
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << getRegisterName(RegNo);
+}
+
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot,
+ const MCSubtargetInfo &STI) {
+ printInstFlags(MI, OS);
+
+ // In 16-bit mode, print data16 as data32.
+ if (MI->getOpcode() == X86::DATA16_PREFIX &&
+ STI.getFeatureBits()[X86::Mode16Bit]) {
+ OS << "\tdata32";
+ } else if (!printAliasInstr(MI, OS) &&
+ !printVecCompareInstr(MI, OS))
+ printInstruction(MI, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+
+ // If verbose assembly is enabled, we can print some informative comments.
+ if (CommentStream)
+ EmitAnyX86InstComments(MI, *CommentStream, MII);
+}
+
+bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS) {
+ if (MI->getNumOperands() == 0 ||
+ !MI->getOperand(MI->getNumOperands() - 1).isImm())
+ return false;
+
+ int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+
+ // Custom print the vector compare instructions to get the immediate
+ // translated into the mnemonic.
+ switch (MI->getOpcode()) {
+ case X86::CMPPDrmi: case X86::CMPPDrri:
+ case X86::CMPPSrmi: case X86::CMPPSrri:
+ case X86::CMPSDrm: case X86::CMPSDrr:
+ case X86::CMPSDrm_Int: case X86::CMPSDrr_Int:
+ case X86::CMPSSrm: case X86::CMPSSrr:
+ case X86::CMPSSrm_Int: case X86::CMPSSrr_Int:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/false, OS);
+ printOperand(MI, 0, OS);
+ OS << ", ";
+      // Skip operand 1 as it's tied to the dest.
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, 2, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, 2, OS);
+ else
+ printxmmwordmem(MI, 2, OS);
+ } else
+ printOperand(MI, 2, OS);
+
+ return true;
+ }
+ break;
+
+ case X86::VCMPPDrmi: case X86::VCMPPDrri:
+ case X86::VCMPPDYrmi: case X86::VCMPPDYrri:
+ case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri:
+ case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri:
+ case X86::VCMPPDZrmi: case X86::VCMPPDZrri:
+ case X86::VCMPPSrmi: case X86::VCMPPSrri:
+ case X86::VCMPPSYrmi: case X86::VCMPPSYrri:
+ case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri:
+ case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri:
+ case X86::VCMPPSZrmi: case X86::VCMPPSZrri:
+ case X86::VCMPSDrm: case X86::VCMPSDrr:
+ case X86::VCMPSDZrm: case X86::VCMPSDZrr:
+ case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int:
+ case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int:
+ case X86::VCMPSSrm: case X86::VCMPSSrr:
+ case X86::VCMPSSZrm: case X86::VCMPSSZrr:
+ case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int:
+ case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int:
+ case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik:
+ case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik:
+ case X86::VCMPPDZrmik: case X86::VCMPPDZrrik:
+ case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik:
+ case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik:
+ case X86::VCMPPSZrmik: case X86::VCMPPSZrrik:
+ case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk:
+ case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk:
+ case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik:
+ case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik:
+ case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik:
+ case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik:
+ case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik:
+ case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik:
+ case X86::VCMPPDZrrib: case X86::VCMPPDZrribk:
+ case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
+ case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
+ case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ if (Imm >= 0 && Imm <= 31) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/true, OS);
+
+ unsigned CurOp = 0;
+ printOperand(MI, CurOp++, OS);
+
+ if (Desc.TSFlags & X86II::EVEX_K) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp++, OS);
+ OS << "}";
+ }
+ OS << ", ";
+ printOperand(MI, CurOp++, OS);
+ OS << ", ";
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp++, OS);
+ else
+ printdwordmem(MI, CurOp++, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, CurOp++, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, CurOp++, OS);
+ else if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp++, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp++, OS);
+ else
+ printxmmwordmem(MI, CurOp++, OS);
+ }
+ } else {
+ printOperand(MI, CurOp++, OS);
+ if (Desc.TSFlags & X86II::EVEX_B)
+ OS << ", {sae}";
+ }
+
+ return true;
+ }
+ break;
+
+ case X86::VPCOMBmi: case X86::VPCOMBri:
+ case X86::VPCOMDmi: case X86::VPCOMDri:
+ case X86::VPCOMQmi: case X86::VPCOMQri:
+ case X86::VPCOMUBmi: case X86::VPCOMUBri:
+ case X86::VPCOMUDmi: case X86::VPCOMUDri:
+ case X86::VPCOMUQmi: case X86::VPCOMUQri:
+ case X86::VPCOMUWmi: case X86::VPCOMUWri:
+ case X86::VPCOMWmi: case X86::VPCOMWri:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printVPCOMMnemonic(MI, OS);
+ printOperand(MI, 0, OS);
+ OS << ", ";
+ printOperand(MI, 1, OS);
+ OS << ", ";
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem)
+ printxmmwordmem(MI, 2, OS);
+ else
+ printOperand(MI, 2, OS);
+ return true;
+ }
+ break;
+
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrri:
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrri:
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrri:
+ case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri:
+ case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri:
+ case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri:
+ case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPUWZ256rmi: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri:
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrri:
+ case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmik: case X86::VPCMPBZrrik:
+ case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmik: case X86::VPCMPDZrrik:
+ case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmik: case X86::VPCMPQZrrik:
+ case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik:
+ case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPUWZ256rmik: case X86::VPCMPUWZ256rrik:
+ case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik:
+ case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmik: case X86::VPCMPWZrrik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
+ case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
+ case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk:
+ case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
+ case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
+ case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk:
+ if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) {
+ OS << '\t';
+ printVPCMPMnemonic(MI, OS);
+
+ unsigned CurOp = 0;
+ printOperand(MI, CurOp++, OS);
+
+ if (Desc.TSFlags & X86II::EVEX_K) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp++, OS);
+ OS << "}";
+ }
+ OS << ", ";
+ printOperand(MI, CurOp++, OS);
+ OS << ", ";
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit as only D and Q are supported.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp++, OS);
+ else
+ printdwordmem(MI, CurOp++, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp++, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp++, OS);
+ else
+ printxmmwordmem(MI, CurOp++, OS);
+ }
+ } else {
+ printOperand(MI, CurOp++, OS);
+ }
+
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
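
For illustration, the "{1toN}" broadcast count computed above is simply the vector width implied by EVEX.L'L (128, 256 or 512 bits) divided by the element width implied by the W bit (32 or 64 bits). A standalone sketch of that arithmetic (plain C++, not part of the patch):

    #include <cassert>

    // Returns the N printed in "{1toN}" for a broadcast memory operand.
    // HasL2/HasL stand in for EVEX.L'L; HasW selects 64-bit over 32-bit elements.
    static unsigned broadcastCount(bool HasL2, bool HasL, bool HasW) {
      unsigned VecBits = HasL2 ? 512 : (HasL ? 256 : 128);
      unsigned EltBits = HasW ? 64 : 32;
      assert(VecBits % EltBits == 0);
      return VecBits / EltBits; // e.g. 512 / 32 = 16 -> "{1to16}"
    }
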
+
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ printRegName(O, Op.getReg());
+ } else if (Op.isImm()) {
+ O << formatImm((int64_t)Op.getImm());
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << "offset ";
+ Op.getExpr()->print(O, &MAI);
+ }
+}
+
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
+ unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+ const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+ const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOperand(MI, Op+X86::AddrBaseReg, O);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ printOperand(MI, Op+X86::AddrIndexReg, O);
+ NeedPlus = true;
+ }
+
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ DispSpec.getExpr()->print(O, &MAI);
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << formatImm(DispVal);
+ }
+ }
+
+ O << ']';
+}
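
For illustration, the routine above renders Intel-syntax addresses such as "dword ptr [rax + 8*rcx - 16]". A simplified standalone sketch of the same formatting rules, with register names passed in as plain strings (hypothetical helper, not LLVM API, not part of the patch):

    #include <cstdint>
    #include <string>

    // Formats base/index/scale/displacement like printMemReference:
    // "[base + scale*index +/- disp]", omitting absent pieces and printing a
    // bare displacement when there is neither a base nor an index register.
    static std::string intelMemRef(const std::string &Base,
                                   const std::string &Index, unsigned Scale,
                                   int64_t Disp) {
      std::string S = "[";
      bool NeedPlus = false;
      if (!Base.empty()) { S += Base; NeedPlus = true; }
      if (!Index.empty()) {
        if (NeedPlus) S += " + ";
        if (Scale != 1) S += std::to_string(Scale) + "*";
        S += Index;
        NeedPlus = true;
      }
      if (Disp != 0 || (Base.empty() && Index.empty())) {
        if (NeedPlus) {
          if (Disp >= 0) {
            S += " + ";
          } else {
            S += " - ";
            Disp = -Disp;
          }
        }
        S += std::to_string(Disp);
      }
      return S + "]";
    }
    // Example: intelMemRef("rax", "rcx", 8, -16) yields "[rax + 8*rcx - 16]".
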
+
+void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+ O << '[';
+ printOperand(MI, Op, O);
+ O << ']';
+}
+
+void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // DI accesses are always ES-based.
+ O << "es:[";
+ printOperand(MI, Op, O);
+ O << ']';
+}
+
+void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &DispSpec = MI->getOperand(Op);
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+
+ O << '[';
+
+ if (DispSpec.isImm()) {
+ O << formatImm(DispSpec.getImm());
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ O << ']';
+}
+
+void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ if (MI->getOperand(Op).isExpr())
+ return MI->getOperand(Op).getExpr()->print(O, &MAI);
+
+ O << formatImm(MI->getOperand(Op).getImm() & 0xff);
+}
+
+void X86IntelInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ unsigned Reg = Op.getReg();
+  // Override the default printing to print st(0) instead of st.
+ if (Reg == X86::ST0)
+ OS << "st(0)";
+ else
+ printRegName(OS, Reg);
+}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
index 3b34a8052bec..f32f49f7c417 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
@@ -1,9 +1,8 @@
//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
-#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H
#include "X86InstPrinterCommon.h"
#include "llvm/Support/raw_ostream.h"
@@ -28,6 +27,13 @@ public:
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
const MCSubtargetInfo &STI) override;
+ bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS);
+
+ // Autogenerated by tblgen, returns true if we successfully printed an
+ // alias.
+ bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx, raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, raw_ostream &O);
@@ -39,6 +45,7 @@ public:
void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
@@ -48,58 +55,38 @@ public:
printMemReference(MI, OpNo, O);
}
- void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "byte ptr ";
printMemReference(MI, OpNo, O);
}
- void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "word ptr ";
printMemReference(MI, OpNo, O);
}
- void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printdwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "dword ptr ";
printMemReference(MI, OpNo, O);
}
- void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printqwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "qword ptr ";
printMemReference(MI, OpNo, O);
}
- void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printxmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "xmmword ptr ";
printMemReference(MI, OpNo, O);
}
- void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printymmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "ymmword ptr ";
printMemReference(MI, OpNo, O);
}
- void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printzmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "zmmword ptr ";
printMemReference(MI, OpNo, O);
}
- void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "dword ptr ";
- printMemReference(MI, OpNo, O);
- }
- void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "qword ptr ";
- printMemReference(MI, OpNo, O);
- }
- void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printtbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "tbyte ptr ";
printMemReference(MI, OpNo, O);
}
- void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "xmmword ptr ";
- printMemReference(MI, OpNo, O);
- }
- void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "ymmword ptr ";
- printMemReference(MI, OpNo, O);
- }
- void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "zmmword ptr ";
- printMemReference(MI, OpNo, O);
- }
void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
@@ -154,4 +141,4 @@ public:
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index fa7c352a1b63..e1125c176b25 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index 30d5c802d1ed..b2369647a40f 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index f5371db9e77a..31d26d08a63f 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -525,9 +524,23 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// indirect register encoding, this handles addresses like [EAX]. The
// encoding for [EBP] with no displacement means [disp32] so we handle it
// by emitting a displacement of 0 below.
- if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
- EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
- return;
+ if (BaseRegNo != N86::EBP) {
+ if (Disp.isImm() && Disp.getImm() == 0) {
+ EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ return;
+ }
+
+ // If the displacement is @tlscall, treat it as a zero.
+ if (Disp.isExpr()) {
+ auto *Sym = dyn_cast<MCSymbolRefExpr>(Disp.getExpr());
+ if (Sym && Sym->getKind() == MCSymbolRefExpr::VK_TLSCALL) {
+ // This is exclusively used by call *a@tlscall(base). The relocation
+ // (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning.
+ Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc()));
+ EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ return;
+ }
+ }
}
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
@@ -880,7 +893,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (HasEVEX_RC) {
unsigned RcOperand = NumOps-1;
assert(RcOperand >= CurOp);
- EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3;
+ EVEX_rc = MI.getOperand(RcOperand).getImm();
+ assert(EVEX_rc <= 3 && "Invalid rounding control!");
}
EncodeRC = true;
}
@@ -979,7 +993,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
uint8_t LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
// Can we use the 2 byte VEX prefix?
- if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
+ if (!(MI.getFlags() & X86::IP_USE_VEX3) &&
+ Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
EmitByte(0xC5, CurByte, OS);
EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
return;
@@ -1060,16 +1075,17 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
break;
case X86II::MRMSrcReg:
+ case X86II::MRMSrcRegCC:
REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
break;
- case X86II::MRMSrcMem: {
+ case X86II::MRMSrcMem:
+ case X86II::MRMSrcMemCC:
REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
CurOp += X86::AddrNumOperands;
break;
- }
case X86II::MRMDestReg:
REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
@@ -1080,7 +1096,7 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
CurOp += X86::AddrNumOperands;
REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
break;
- case X86II::MRMXm:
+ case X86II::MRMXmCC: case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
@@ -1088,7 +1104,7 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
break;
- case X86II::MRMXr:
+ case X86II::MRMXrCC: case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
@@ -1272,6 +1288,8 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
BaseOpcode = 0x0F; // Weird 3DNow! encoding.
+ unsigned OpcodeOffset = 0;
+
uint64_t Form = TSFlags & X86II::FormMask;
switch (Form) {
default: errs() << "FORM: " << Form << "\n";
@@ -1318,8 +1336,14 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
break;
}
- case X86II::RawFrm: {
- EmitByte(BaseOpcode, CurByte, OS);
+ case X86II::AddCCFrm: {
+ // This will be added to the opcode in the fallthrough.
+ OpcodeOffset = MI.getOperand(NumOps - 1).getImm();
+ assert(OpcodeOffset < 16 && "Unexpected opcode offset!");
+ --NumOps; // Drop the operand from the end.
+ LLVM_FALLTHROUGH;
+ case X86II::RawFrm:
+ EmitByte(BaseOpcode + OpcodeOffset, CurByte, OS);
if (!is64BitMode(STI) || !isPCRel32Branch(MI))
break;
@@ -1436,6 +1460,17 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = SrcRegNum + 1;
break;
}
+ case X86II::MRMSrcRegCC: {
+ unsigned FirstOp = CurOp++;
+ unsigned SecondOp = CurOp++;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ EmitByte(BaseOpcode + CC, CurByte, OS);
+
+ EmitRegModRMByte(MI.getOperand(SecondOp),
+ GetX86RegNum(MI.getOperand(FirstOp)), CurByte, OS);
+ break;
+ }
case X86II::MRMSrcMem: {
unsigned FirstMemOp = CurOp+1;
@@ -1481,6 +1516,27 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = FirstMemOp + X86::AddrNumOperands;
break;
}
+ case X86II::MRMSrcMemCC: {
+ unsigned RegOp = CurOp++;
+ unsigned FirstMemOp = CurOp;
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ EmitByte(BaseOpcode + CC, CurByte, OS);
+
+ emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(RegOp)),
+ TSFlags, Rex, CurByte, OS, Fixups, STI);
+ break;
+ }
+
+ case X86II::MRMXrCC: {
+ unsigned RegOp = CurOp++;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ EmitByte(BaseOpcode + CC, CurByte, OS);
+ EmitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS);
+ break;
+ }
case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
@@ -1497,6 +1553,17 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurByte, OS);
break;
+ case X86II::MRMXmCC: {
+ unsigned FirstMemOp = CurOp;
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ EmitByte(BaseOpcode + CC, CurByte, OS);
+
+ emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, Rex, CurByte, OS, Fixups, STI);
+ break;
+ }
+
case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
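
For illustration, the new MRMSrcRegCC/MRMSrcMemCC/MRMXrCC/MRMXmCC and AddCCFrm forms above fold the condition code into the opcode byte itself, the same way the classic Jcc/SETcc/CMOVcc families are encoded as a base opcode plus a 4-bit condition. A minimal standalone sketch of that arithmetic (plain C++, not part of the patch; the usual Intel condition numbering 0=o, 1=no, 2=b, 3=ae, 4=e, 5=ne, ... is assumed):

    #include <cassert>
    #include <cstdint>

    // Computes the final opcode byte for a condition-coded instruction family.
    // For example, CMOVcc lives at 0x0F 0x40+cc, so CMOVNE (cc = 5) becomes
    // 0x0F 0x45.
    static uint8_t opcodeWithCondition(uint8_t BaseOpcode, unsigned CC) {
      assert(CC < 16 && "x86 condition codes are 4 bits");
      return static_cast<uint8_t>(BaseOpcode + CC);
    }
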
diff --git a/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/lib/Target/X86/MCTargetDesc/X86MCExpr.h
index 1070f70468fa..532fecd9951b 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCExpr.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCExpr.h
@@ -1,9 +1,8 @@
//=--- X86MCExpr.h - X86 specific MC expression classes ---*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H
#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H
-#include "InstPrinter/X86ATTInstPrinter.h"
+#include "X86ATTInstPrinter.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index ea4aaf14223d..ce05ad974507 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,13 +11,15 @@
//===----------------------------------------------------------------------===//
#include "X86MCTargetDesc.h"
-#include "InstPrinter/X86ATTInstPrinter.h"
-#include "InstPrinter/X86IntelInstPrinter.h"
+#include "TargetInfo/X86TargetInfo.h"
+#include "X86ATTInstPrinter.h"
#include "X86BaseInfo.h"
+#include "X86IntelInstPrinter.h"
#include "X86MCAsmInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Triple.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -117,6 +118,15 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
{codeview::RegisterId::ST6, X86::FP6},
{codeview::RegisterId::ST7, X86::FP7},
+ {codeview::RegisterId::MM0, X86::MM0},
+ {codeview::RegisterId::MM1, X86::MM1},
+ {codeview::RegisterId::MM2, X86::MM2},
+ {codeview::RegisterId::MM3, X86::MM3},
+ {codeview::RegisterId::MM4, X86::MM4},
+ {codeview::RegisterId::MM5, X86::MM5},
+ {codeview::RegisterId::MM6, X86::MM6},
+ {codeview::RegisterId::MM7, X86::MM7},
+
{codeview::RegisterId::XMM0, X86::XMM0},
{codeview::RegisterId::XMM1, X86::XMM1},
{codeview::RegisterId::XMM2, X86::XMM2},
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 4e9f5ba60d2e..00dd5908cbf5 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -35,9 +34,6 @@ class StringRef;
class raw_ostream;
class raw_pwrite_stream;
-Target &getTheX86_32Target();
-Target &getTheX86_64Target();
-
/// Flavour of dwarf regnumbers
///
namespace DWARFFlavour {
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 883278b7bc1f..fc7e99f61e5e 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h b/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h
index 10a282dd2962..3b1e9e7c34fb 100644
--- a/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h
+++ b/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h
@@ -1,9 +1,8 @@
//===- X86TargetStreamer.h ------------------------------*- C++ -*---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 2aec695b2dbf..3baab9da1c41 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 0085787e576a..796a27a17255 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -1,9 +1,8 @@
//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
index bee9b7046338..e9987d1f62bd 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -1,9 +1,8 @@
//===-- X86WinCOFFTargetStreamer.cpp ----------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/ShadowCallStack.cpp b/lib/Target/X86/ShadowCallStack.cpp
deleted file mode 100644
index ab2cebcb58ee..000000000000
--- a/lib/Target/X86/ShadowCallStack.cpp
+++ /dev/null
@@ -1,322 +0,0 @@
-//===------- ShadowCallStack.cpp - Shadow Call Stack pass -----------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The ShadowCallStack pass instruments function prologs/epilogs to check that
-// the return address has not been corrupted during the execution of the
-// function. The return address is stored in a 'shadow call stack' addressed
-// using the %gs segment register.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86InstrBuilder.h"
-#include "X86InstrInfo.h"
-#include "X86Subtarget.h"
-
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-namespace {
-
-class ShadowCallStack : public MachineFunctionPass {
-public:
- static char ID;
-
- ShadowCallStack() : MachineFunctionPass(ID) {
- initializeShadowCallStackPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &Fn) override;
-
-private:
- // Do not instrument leaf functions with this many or fewer instructions. The
- // shadow call stack instrumented prolog/epilog are slightly race-y reading
- // and checking the saved return address, so it is better to not instrument
- // functions that have fewer instructions than the instrumented prolog/epilog
- // race.
- static const size_t SkipLeafInstructions = 3;
-};
-
-char ShadowCallStack::ID = 0;
-} // end anonymous namespace.
-
-static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII,
- MachineBasicBlock &MBB, const DebugLoc &DL);
-static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII,
- MachineBasicBlock &MBB, const DebugLoc &DL,
- MCPhysReg FreeRegister);
-
-static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB);
-static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB,
- MCPhysReg FreeRegister);
-// Generate a longer epilog that only uses r10 when a tailcall branches to r11.
-static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB);
-
-// Helper function to add ModR/M references for [Seg: Reg + Offset] memory
-// accesses
-static inline const MachineInstrBuilder &
-addSegmentedMem(const MachineInstrBuilder &MIB, MCPhysReg Seg, MCPhysReg Reg,
- int Offset = 0) {
- return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset).addReg(Seg);
-}
-
-static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII,
- MachineBasicBlock &MBB, const DebugLoc &DL) {
- const MCPhysReg ReturnReg = X86::R10;
- const MCPhysReg OffsetReg = X86::R11;
-
- auto MBBI = MBB.begin();
- // mov r10, [rsp]
- addDirectMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(ReturnReg),
- X86::RSP);
- // xor r11, r11
- BuildMI(MBB, MBBI, DL, TII->get(X86::XOR64rr))
- .addDef(OffsetReg)
- .addReg(OffsetReg, RegState::Undef)
- .addReg(OffsetReg, RegState::Undef);
- // add QWORD [gs:r11], 8
- addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::ADD64mi8)), X86::GS,
- OffsetReg)
- .addImm(8);
- // mov r11, [gs:r11]
- addSegmentedMem(
- BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(OffsetReg), X86::GS,
- OffsetReg);
- // mov [gs:r11], r10
- addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64mr)), X86::GS,
- OffsetReg)
- .addReg(ReturnReg);
-}
-
-static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII,
- MachineBasicBlock &MBB, const DebugLoc &DL,
- MCPhysReg FreeRegister) {
- // mov REG, [rsp]
- addDirectMem(BuildMI(MBB, MBB.begin(), DL, TII->get(X86::MOV64rm))
- .addDef(FreeRegister),
- X86::RSP);
-}
-
-static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB) {
- const DebugLoc &DL = MI.getDebugLoc();
-
- // xor r11, r11
- BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr))
- .addDef(X86::R11)
- .addReg(X86::R11, RegState::Undef)
- .addReg(X86::R11, RegState::Undef);
- // mov r10, [gs:r11]
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
- X86::GS, X86::R11);
- // mov r10, [gs:r10]
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
- X86::GS, X86::R10);
- // sub QWORD [gs:r11], 8
- // This instruction should not be moved up to avoid a signal race.
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)),
- X86::GS, X86::R11)
- .addImm(8);
- // cmp [rsp], r10
- addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
- .addReg(X86::R10);
- // jne trap
- BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
- MBB.addSuccessor(&TrapBB);
-}
-
-static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB,
- MCPhysReg FreeRegister) {
- const DebugLoc &DL = MI.getDebugLoc();
-
- // cmp [rsp], REG
- addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
- .addReg(FreeRegister);
- // jne trap
- BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
- MBB.addSuccessor(&TrapBB);
-}
-
-static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB) {
- const DebugLoc &DL = MI.getDebugLoc();
-
- // xor r10, r10
- BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr))
- .addDef(X86::R10)
- .addReg(X86::R10, RegState::Undef)
- .addReg(X86::R10, RegState::Undef);
- // mov r10, [gs:r10]
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
- X86::GS, X86::R10);
- // mov r10, [gs:r10]
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
- X86::GS, X86::R10);
- // sub QWORD [gs:0], 8
- // This instruction should not be moved up to avoid a signal race.
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)), X86::GS, 0)
- .addImm(8);
- // cmp [rsp], r10
- addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
- .addReg(X86::R10);
- // jne trap
- BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
- MBB.addSuccessor(&TrapBB);
-}
-
-bool ShadowCallStack::runOnMachineFunction(MachineFunction &Fn) {
- if (!Fn.getFunction().hasFnAttribute(Attribute::ShadowCallStack) ||
- Fn.getFunction().hasFnAttribute(Attribute::Naked))
- return false;
-
- if (Fn.empty() || !Fn.getRegInfo().tracksLiveness())
- return false;
-
- // FIXME: Skip functions that have r10 or r11 live on entry (r10 can be live
- // on entry for parameters with the nest attribute.)
- if (Fn.front().isLiveIn(X86::R10) || Fn.front().isLiveIn(X86::R11))
- return false;
-
- // FIXME: Skip functions with conditional and r10 tail calls for now.
- bool HasReturn = false;
- for (auto &MBB : Fn) {
- if (MBB.empty())
- continue;
-
- const MachineInstr &MI = MBB.instr_back();
- if (MI.isReturn())
- HasReturn = true;
-
- if (MI.isReturn() && MI.isCall()) {
- if (MI.findRegisterUseOperand(X86::EFLAGS))
- return false;
- // This should only be possible on Windows 64 (see GR64_TC versus
- // GR64_TCW64.)
- if (MI.findRegisterUseOperand(X86::R10) ||
- MI.hasRegisterImplicitUseOperand(X86::R10))
- return false;
- }
- }
-
- if (!HasReturn)
- return false;
-
- // For leaf functions:
- // 1. Do not instrument very short functions where it would not improve that
- // function's security.
- // 2. Detect if there is an unused caller-saved register we can reserve to
- // hold the return address instead of writing/reading it from the shadow
- // call stack.
- MCPhysReg LeafFuncRegister = X86::NoRegister;
- if (!Fn.getFrameInfo().adjustsStack()) {
- size_t InstructionCount = 0;
- std::bitset<X86::NUM_TARGET_REGS> UsedRegs;
- for (auto &MBB : Fn) {
- for (auto &LiveIn : MBB.liveins())
- UsedRegs.set(LiveIn.PhysReg);
- for (auto &MI : MBB) {
- if (!MI.isDebugValue() && !MI.isCFIInstruction() && !MI.isLabel())
- InstructionCount++;
- for (auto &Op : MI.operands())
- if (Op.isReg() && Op.isDef())
- UsedRegs.set(Op.getReg());
- }
- }
-
- if (InstructionCount <= SkipLeafInstructions)
- return false;
-
- std::bitset<X86::NUM_TARGET_REGS> CalleeSavedRegs;
- const MCPhysReg *CSRegs = Fn.getRegInfo().getCalleeSavedRegs();
- for (size_t i = 0; CSRegs[i]; i++)
- CalleeSavedRegs.set(CSRegs[i]);
-
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
- for (auto &Reg : X86::GR64_NOSPRegClass.getRegisters()) {
- // FIXME: Optimization opportunity: spill/restore a callee-saved register
- // if a caller-saved register is unavailable.
- if (CalleeSavedRegs.test(Reg))
- continue;
-
- bool Used = false;
- for (MCSubRegIterator SR(Reg, TRI, true); SR.isValid(); ++SR)
- if ((Used = UsedRegs.test(*SR)))
- break;
-
- if (!Used) {
- LeafFuncRegister = Reg;
- break;
- }
- }
- }
-
- const bool LeafFuncOptimization = LeafFuncRegister != X86::NoRegister;
- if (LeafFuncOptimization)
- // Mark the leaf function register live-in for all MBBs except the entry MBB
- for (auto I = ++Fn.begin(), E = Fn.end(); I != E; ++I)
- I->addLiveIn(LeafFuncRegister);
-
- MachineBasicBlock &MBB = Fn.front();
- const MachineBasicBlock *NonEmpty = MBB.empty() ? MBB.getFallThrough() : &MBB;
- const DebugLoc &DL = NonEmpty->front().getDebugLoc();
-
- const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
- if (LeafFuncOptimization)
- addPrologLeaf(Fn, TII, MBB, DL, LeafFuncRegister);
- else
- addProlog(Fn, TII, MBB, DL);
-
- MachineBasicBlock *Trap = nullptr;
- for (auto &MBB : Fn) {
- if (MBB.empty())
- continue;
-
- MachineInstr &MI = MBB.instr_back();
- if (MI.isReturn()) {
- if (!Trap) {
- Trap = Fn.CreateMachineBasicBlock();
- BuildMI(Trap, MI.getDebugLoc(), TII->get(X86::TRAP));
- Fn.push_back(Trap);
- }
-
- if (LeafFuncOptimization)
- addEpilogLeaf(TII, MBB, MI, *Trap, LeafFuncRegister);
- else if (MI.findRegisterUseOperand(X86::R11))
- addEpilogOnlyR10(TII, MBB, MI, *Trap);
- else
- addEpilog(TII, MBB, MI, *Trap);
- }
- }
-
- return true;
-}
-
-INITIALIZE_PASS(ShadowCallStack, "shadow-call-stack", "Shadow Call Stack",
- false, false)
-
-FunctionPass *llvm::createShadowCallStackPass() {
- return new ShadowCallStack();
-}
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index 16c2b56c48b5..47c41626a666 100644
--- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -1,13 +1,12 @@
//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "TargetInfo/X86TargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.h b/lib/Target/X86/TargetInfo/X86TargetInfo.h
new file mode 100644
index 000000000000..caf6b8d424fc
--- /dev/null
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.h
@@ -0,0 +1,21 @@
+//===-- X86TargetInfo.h - X86 Target Implementation -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H
+#define LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheX86_32Target();
+Target &getTheX86_64Target();
+
+}
+
+#endif // LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index bed940d0d0e9..48fd3e0b7ab9 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -1,9 +1,8 @@
//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -300,7 +299,7 @@ void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
unsigned HalfMask = Imm >> (l * 4);
unsigned HalfBegin = (HalfMask & 0x3) * HalfSize;
for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i)
- ShuffleMask.push_back(HalfMask & 8 ? SM_SentinelZero : (int)i);
+ ShuffleMask.push_back((HalfMask & 8) ? SM_SentinelZero : (int)i);
}
}
@@ -384,7 +383,8 @@ void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
}
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
- unsigned NumDstElts, SmallVectorImpl<int> &Mask) {
+ unsigned NumDstElts, bool IsAnyExtend,
+ SmallVectorImpl<int> &Mask) {
unsigned Scale = DstScalarBits / SrcScalarBits;
assert(SrcScalarBits < DstScalarBits &&
"Expected zero extension mask to increase scalar size");
@@ -392,7 +392,7 @@ void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
for (unsigned i = 0; i != NumDstElts; i++) {
Mask.push_back(i);
for (unsigned j = 1; j != Scale; j++)
- Mask.push_back(SM_SentinelZero);
+ Mask.push_back(IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero);
}
}
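
For illustration, the decoded widening mask places each source element at the start of its destination slot and fills the remaining lanes with zero (or, with the new IsAnyExtend flag, undef). A standalone sketch of the resulting shape (plain C++, not part of the patch; the sentinel constants are stand-ins for the real SM_Sentinel* values):

    #include <vector>

    // Builds the mask shape produced by DecodeZeroExtendMask: destination
    // element i reads source element i, and the other Scale-1 lanes per slot
    // are zero (zero-extend) or undef (any-extend).
    static std::vector<int> extendMask(unsigned NumDstElts, unsigned Scale,
                                       bool IsAnyExtend) {
      const int Undef = -1, Zero = -2; // stand-ins for the LLVM sentinels
      std::vector<int> Mask;
      for (unsigned i = 0; i != NumDstElts; ++i) {
        Mask.push_back(static_cast<int>(i));
        for (unsigned j = 1; j != Scale; ++j)
          Mask.push_back(IsAnyExtend ? Undef : Zero);
      }
      // e.g. NumDstElts = 4, Scale = 4 (pmovzxbd): 0,Z,Z,Z, 1,Z,Z,Z, 2,Z,Z,Z, 3,Z,Z,Z
      return Mask;
    }
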
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 85cde14a3241..f52785063071 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -1,9 +1,8 @@
//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -137,7 +136,7 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
/// Decode a zero extension instruction as a shuffle mask.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
- unsigned NumDstElts,
+ unsigned NumDstElts, bool IsAnyExtend,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a move lower and zero upper instruction as a shuffle mask.
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 1c8813815b86..a95f68434d12 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -1,9 +1,8 @@
//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -50,11 +49,6 @@ FunctionPass *createX86FloatingPointStackifierPass();
/// transition penalty between functions encoded with AVX and SSE.
FunctionPass *createX86IssueVZeroUpperPass();
-/// This pass instruments the function prolog to save the return address to a
-/// 'shadow call stack' and the function epilog to check that the return address
-/// did not change during function execution.
-FunctionPass *createShadowCallStackPass();
-
/// This pass inserts ENDBR instructions before indirect jump/call
/// destinations as part of CET IBT mechanism.
FunctionPass *createX86IndirectBranchTrackingPass();
@@ -138,11 +132,12 @@ FunctionPass *createX86SpeculativeLoadHardeningPass();
void initializeEvexToVexInstPassPass(PassRegistry &);
void initializeFixupBWInstPassPass(PassRegistry &);
void initializeFixupLEAPassPass(PassRegistry &);
-void initializeShadowCallStackPass(PassRegistry &);
+void initializeFPSPass(PassRegistry &);
void initializeWinEHStatePassPass(PassRegistry &);
void initializeX86AvoidSFBPassPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
+void initializeX86ExpandPseudoPass(PassRegistry&);
void initializeX86CondBrFoldingPassPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
void initializeX86ExecutionDomainFixPass(PassRegistry &);
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 6b1749fc7500..3112f00c91f2 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -1,9 +1,8 @@
//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -40,6 +39,9 @@ def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true",
def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
"Enable conditional move instructions">;
+def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true",
+ "Support CMPXCHG8B instructions">;
+
def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
"Support POPCNT instruction">;
@@ -165,9 +167,16 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
"Enable AVX-512 Vector Neural Network Instructions",
[FeatureAVX512]>;
+def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
+ "Support bfloat16 floating point",
+ [FeatureBWI]>;
def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true",
"Enable AVX-512 Bit Algorithms",
[FeatureBWI]>;
+def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect",
+ "HasVP2INTERSECT", "true",
+ "Enable AVX-512 vp2intersect",
+ [FeatureAVX512]>;
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
@@ -258,6 +267,8 @@ def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
"Support RDPID instructions">;
def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
"Wait and pause enhancements">;
+def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
+ "Has ENQCMD instructions">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
@@ -274,7 +285,7 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
- "Use software floating point features.">;
+ "Use software floating point features">;
def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
"HasPOPCNTFalseDeps", "true",
"POPCNT has a false dependency on dest register">;
@@ -342,6 +353,12 @@ def FeatureERMSB
"ermsb", "HasERMSB", "true",
"REP MOVS/STOS are fast">;
+// Bulldozer and newer processors can merge CMP/TEST (but not other
+// instructions) with conditional branches.
+def FeatureBranchFusion
+ : SubtargetFeature<"branchfusion", "HasBranchFusion", "true",
+ "CMP/TEST can be fused with conditional branches">;
+
// Sandy Bridge and newer processors have many instructions that can be
// fused with conditional branches and pass through the CPU as a single
// operation.
@@ -355,7 +372,7 @@ def FeatureMacroFusion
// similar to Skylake Server (AVX-512).
def FeatureHasFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
- "Indicates if gather is reasonably fast.">;
+ "Indicates if gather is reasonably fast">;
def FeaturePrefer256Bit
: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
@@ -366,7 +383,7 @@ def FeaturePrefer256Bit
def FeatureRetpolineIndirectCalls
: SubtargetFeature<
"retpoline-indirect-calls", "UseRetpolineIndirectCalls", "true",
- "Remove speculation of indirect calls from the generated code.">;
+ "Remove speculation of indirect calls from the generated code">;
// Lower indirect branches and switches either using conditional branch trees
// or using a special construct called a `retpoline` to mitigate potential
@@ -374,7 +391,7 @@ def FeatureRetpolineIndirectCalls
def FeatureRetpolineIndirectBranches
: SubtargetFeature<
"retpoline-indirect-branches", "UseRetpolineIndirectBranches", "true",
- "Remove speculation of indirect branches from the generated code.">;
+ "Remove speculation of indirect branches from the generated code">;
// Deprecated umbrella feature for enabling both `retpoline-indirect-calls` and
// `retpoline-indirect-branches` above.
@@ -382,7 +399,7 @@ def FeatureRetpoline
: SubtargetFeature<"retpoline", "DeprecatedUseRetpoline", "true",
"Remove speculation of indirect branches from the "
"generated code, either by avoiding them entirely or "
- "lowering them with a speculation blocking construct.",
+ "lowering them with a speculation blocking construct",
[FeatureRetpolineIndirectCalls,
FeatureRetpolineIndirectBranches]>;
@@ -395,7 +412,7 @@ def FeatureRetpolineExternalThunk
"When lowering an indirect call or branch using a `retpoline`, rely "
"on the specified user provided thunk rather than emitting one "
"ourselves. Only has effect when combined with some other retpoline "
- "feature.", [FeatureRetpolineIndirectCalls]>;
+ "feature", [FeatureRetpolineIndirectCalls]>;
// Direct Move instructions.
def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
@@ -405,7 +422,7 @@ def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
"Indicates that the BEXTR instruction is implemented as a single uop "
- "with good throughput.">;
+ "with good throughput">;
// Combine vector math operations with shuffles into horizontal math
// instructions if a CPU implements horizontal operations (introduced with
@@ -416,12 +433,33 @@ def FeatureFastHorizontalOps
"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
"normal vector instructions with shuffles", [FeatureSSE3]>;
+def FeatureFastScalarShiftMasks
+ : SubtargetFeature<
+ "fast-scalar-shift-masks", "HasFastScalarShiftMasks", "true",
+ "Prefer a left/right scalar logical shift pair over a shift+and pair">;
+
+def FeatureFastVectorShiftMasks
+ : SubtargetFeature<
+ "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
+ "Prefer a left/right vector logical shift pair over a shift+and pair">;
+
// Merge branches using three-way conditional code.
def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
"ThreewayBranchProfitable", "true",
"Merge branches to a three-way "
"conditional branch">;
+// Bonnell
+def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
+// Silvermont
+def ProcIntelSLM : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">;
+// Goldmont
+def ProcIntelGLM : SubtargetFeature<"", "X86ProcFamily", "IntelGLM", "">;
+// Goldmont Plus
+def ProcIntelGLP : SubtargetFeature<"", "X86ProcFamily", "IntelGLP", "">;
+// Tremont
+def ProcIntelTRM : SubtargetFeature<"", "X86ProcFamily", "IntelTRM", "">;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -440,7 +478,7 @@ include "X86SchedPredicates.td"
def X86InstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
-// X86 processors supported.
+// X86 Scheduler Models
//===----------------------------------------------------------------------===//
include "X86ScheduleAtom.td"
@@ -454,37 +492,468 @@ include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
include "X86SchedSkylakeServer.td"
-def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
- "Intel Atom processors">;
-def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
- "Intel Silvermont processors">;
-def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM",
- "Intel Goldmont processors">;
-def ProcIntelGLP : SubtargetFeature<"glp", "X86ProcFamily", "IntelGLP",
- "Intel Goldmont Plus processors">;
-def ProcIntelTRM : SubtargetFeature<"tremont", "X86ProcFamily", "IntelTRM",
- "Intel Tremont processors">;
+//===----------------------------------------------------------------------===//
+// X86 Processor Feature Lists
+//===----------------------------------------------------------------------===//
+
+def ProcessorFeatures {
+ // Nehalem
+ list<SubtargetFeature> NHMInheritableFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF,
+ FeatureMacroFusion];
+ list<SubtargetFeature> NHMSpecificFeatures = [];
+ list<SubtargetFeature> NHMFeatures =
+ !listconcat(NHMInheritableFeatures, NHMSpecificFeatures);
+
+ // Westmere
+ list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
+ list<SubtargetFeature> WSMSpecificFeatures = [];
+ list<SubtargetFeature> WSMInheritableFeatures =
+ !listconcat(NHMInheritableFeatures, WSMAdditionalFeatures);
+ list<SubtargetFeature> WSMFeatures =
+ !listconcat(WSMInheritableFeatures, WSMSpecificFeatures);
+
+ // Sandybridge
+ list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX,
+ FeatureSlowDivide64,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureSlow3OpsLEA,
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureMergeToThreeWayBranch];
+ list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> SNBInheritableFeatures =
+ !listconcat(WSMInheritableFeatures, SNBAdditionalFeatures);
+ list<SubtargetFeature> SNBFeatures =
+ !listconcat(SNBInheritableFeatures, SNBSpecificFeatures);
+
+ // Ivybridge
+ list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase];
+ list<SubtargetFeature> IVBSpecificFeatures = [FeatureSlowUAMem32,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> IVBInheritableFeatures =
+ !listconcat(SNBInheritableFeatures, IVBAdditionalFeatures);
+ list<SubtargetFeature> IVBFeatures =
+ !listconcat(IVBInheritableFeatures, IVBSpecificFeatures);
+
+ // Haswell
+ list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureERMSB,
+ FeatureFMA,
+ FeatureINVPCID,
+ FeatureLZCNT,
+ FeatureMOVBE,
+ FeatureFastVariableShuffle];
+ list<SubtargetFeature> HSWSpecificFeatures = [FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps];
+ list<SubtargetFeature> HSWInheritableFeatures =
+ !listconcat(IVBInheritableFeatures, HSWAdditionalFeatures);
+ list<SubtargetFeature> HSWFeatures =
+ !listconcat(HSWInheritableFeatures, HSWSpecificFeatures);
+
+ // Broadwell
+ list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX,
+ FeatureRDSEED,
+ FeaturePRFCHW];
+ list<SubtargetFeature> BDWSpecificFeatures = [FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps];
+ list<SubtargetFeature> BDWInheritableFeatures =
+ !listconcat(HSWInheritableFeatures, BDWAdditionalFeatures);
+ list<SubtargetFeature> BDWFeatures =
+ !listconcat(BDWInheritableFeatures, BDWSpecificFeatures);
+
+ // Skylake
+ list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
+ FeatureMPX,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT,
+ FeatureFastVectorFSQRT];
+ list<SubtargetFeature> SKLSpecificFeatures = [FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps,
+ FeatureSGX];
+ list<SubtargetFeature> SKLInheritableFeatures =
+ !listconcat(BDWInheritableFeatures, SKLAdditionalFeatures);
+ list<SubtargetFeature> SKLFeatures =
+ !listconcat(SKLInheritableFeatures, SKLSpecificFeatures);
+
+ // Skylake-AVX512
+ list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureBWI,
+ FeatureVLX,
+ FeaturePKU,
+ FeatureCLWB];
+ list<SubtargetFeature> SKXSpecificFeatures = [FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> SKXInheritableFeatures =
+ !listconcat(SKLInheritableFeatures, SKXAdditionalFeatures);
+ list<SubtargetFeature> SKXFeatures =
+ !listconcat(SKXInheritableFeatures, SKXSpecificFeatures);
+
+ // Cascadelake
+ list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI];
+ list<SubtargetFeature> CLXSpecificFeatures = [FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> CLXInheritableFeatures =
+ !listconcat(SKXInheritableFeatures, CLXAdditionalFeatures);
+ list<SubtargetFeature> CLXFeatures =
+ !listconcat(CLXInheritableFeatures, CLXSpecificFeatures);
+
+ // Cooperlake
+ list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16];
+ list<SubtargetFeature> CPXSpecificFeatures = [FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> CPXInheritableFeatures =
+ !listconcat(CLXInheritableFeatures, CPXAdditionalFeatures);
+ list<SubtargetFeature> CPXFeatures =
+ !listconcat(CPXInheritableFeatures, CPXSpecificFeatures);
+
+ // Cannonlake
+ list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureBWI,
+ FeatureVLX,
+ FeaturePKU,
+ FeatureVBMI,
+ FeatureIFMA,
+ FeatureSHA,
+ FeatureSGX];
+ list<SubtargetFeature> CNLSpecificFeatures = [FeatureHasFastGather];
+ list<SubtargetFeature> CNLInheritableFeatures =
+ !listconcat(SKLInheritableFeatures, CNLAdditionalFeatures);
+ list<SubtargetFeature> CNLFeatures =
+ !listconcat(CNLInheritableFeatures, CNLSpecificFeatures);
+
+ // Icelake
+ list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG,
+ FeatureVAES,
+ FeatureVBMI2,
+ FeatureVNNI,
+ FeatureVPCLMULQDQ,
+ FeatureVPOPCNTDQ,
+ FeatureGFNI,
+ FeatureCLWB,
+ FeatureRDPID];
+ list<SubtargetFeature> ICLSpecificFeatures = [FeatureHasFastGather];
+ list<SubtargetFeature> ICLInheritableFeatures =
+ !listconcat(CNLInheritableFeatures, ICLAdditionalFeatures);
+ list<SubtargetFeature> ICLFeatures =
+ !listconcat(ICLInheritableFeatures, ICLSpecificFeatures);
+
+ // Icelake Server
+ list<SubtargetFeature> ICXSpecificFeatures = [FeaturePCONFIG,
+ FeatureWBNOINVD,
+ FeatureHasFastGather];
+ list<SubtargetFeature> ICXFeatures =
+ !listconcat(ICLInheritableFeatures, ICXSpecificFeatures);
+
+ // Atom
+ list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeatureSlowTwoMemOps,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom,
+ FeatureSlowUAMem16,
+ FeatureLEAForSP,
+ FeatureSlowDivide32,
+ FeatureSlowDivide64,
+ FeatureLEAUsesAG,
+ FeaturePadShortFunctions];
+ list<SubtargetFeature> AtomFeatures =
+ !listconcat(AtomInheritableFeatures, AtomSpecificFeatures);
+
+ // Silvermont
+ list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
+ FeaturePOPCNT,
+ FeaturePCLMUL,
+ FeaturePRFCHW,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureRDRAND];
+ list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM,
+ FeatureSlowDivide64,
+ FeatureSlowPMULLD,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> SLMInheritableFeatures =
+ !listconcat(AtomInheritableFeatures, SLMAdditionalFeatures);
+ list<SubtargetFeature> SLMFeatures =
+ !listconcat(SLMInheritableFeatures, SLMSpecificFeatures);
+
+ // Goldmont
+ list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES,
+ FeatureMPX,
+ FeatureSHA,
+ FeatureRDSEED,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT,
+ FeatureFSGSBase];
+ list<SubtargetFeature> GLMSpecificFeatures = [ProcIntelGLM,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> GLMInheritableFeatures =
+ !listconcat(SLMInheritableFeatures, GLMAdditionalFeatures);
+ list<SubtargetFeature> GLMFeatures =
+ !listconcat(GLMInheritableFeatures, GLMSpecificFeatures);
+
+ // Goldmont Plus
+ list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
+ FeatureRDPID,
+ FeatureSGX];
+ list<SubtargetFeature> GLPSpecificFeatures = [ProcIntelGLP];
+ list<SubtargetFeature> GLPInheritableFeatures =
+ !listconcat(GLMInheritableFeatures, GLPAdditionalFeatures);
+ list<SubtargetFeature> GLPFeatures =
+ !listconcat(GLPInheritableFeatures, GLPSpecificFeatures);
+
+ // Tremont
+ list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLDEMOTE,
+ FeatureGFNI,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureWAITPKG];
+ list<SubtargetFeature> TRMSpecificFeatures = [ProcIntelTRM];
+ list<SubtargetFeature> TRMFeatures =
+ !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures,
+ TRMSpecificFeatures);
+
+ // Knights Landing
+ list<SubtargetFeature> KNLFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeaturePOPCNT,
+ FeatureSlowDivide64,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureLAHFSAHF,
+ FeatureSlow3OpsLEA,
+ FeatureSlowIncDec,
+ FeatureAES,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureAVX512,
+ FeatureERI,
+ FeatureCDI,
+ FeaturePFI,
+ FeaturePREFETCHWT1,
+ FeatureADX,
+ FeatureRDSEED,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeaturePRFCHW,
+ FeatureSlowTwoMemOps,
+ FeatureFastPartialYMMorZMMWrite,
+ FeatureHasFastGather,
+ FeatureSlowPMADDWD];
+ // TODO Add AVX5124FMAPS/AVX5124VNNIW features
+ list<SubtargetFeature> KNMFeatures =
+ !listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
+
+
+ // Bobcat
+ list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureSSE4A,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF,
+ FeatureFast15ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureFastVectorShiftMasks];
+ list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;
+
+ // Jaguar
+ list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureBMI,
+ FeatureF16C,
+ FeatureMOVBE,
+ FeatureXSAVE,
+ FeatureXSAVEOPT];
+ list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT,
+ FeatureFastBEXTR,
+ FeatureFastPartialYMMorZMMWrite,
+ FeatureFastHorizontalOps];
+ list<SubtargetFeature> BtVer2InheritableFeatures =
+ !listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures);
+ list<SubtargetFeature> BtVer2Features =
+ !listconcat(BtVer2InheritableFeatures, BtVer2SpecificFeatures);
+
+ // Bulldozer
+ list<SubtargetFeature> BdVer1InheritableFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureXOP,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureLWP,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureBranchFusion];
+ list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
+
+ // PileDriver
+ list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
+ FeatureBMI,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureFastBEXTR];
+ list<SubtargetFeature> BdVer2InheritableFeatures =
+ !listconcat(BdVer1InheritableFeatures, BdVer2AdditionalFeatures);
+ list<SubtargetFeature> BdVer2Features = BdVer2InheritableFeatures;
+
+ // Steamroller
+ list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT,
+ FeatureFSGSBase];
+ list<SubtargetFeature> BdVer3InheritableFeatures =
+ !listconcat(BdVer2InheritableFeatures, BdVer3AdditionalFeatures);
+ list<SubtargetFeature> BdVer3Features = BdVer3InheritableFeatures;
+
+ // Excavator
+ list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
+ FeatureBMI2,
+ FeatureMWAITX];
+ list<SubtargetFeature> BdVer4InheritableFeatures =
+ !listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures);
+ list<SubtargetFeature> BdVer4Features = BdVer4InheritableFeatures;
+
+
+ // AMD Zen Processors common ISAs
+ list<SubtargetFeature> ZNFeatures = [FeatureADX,
+ FeatureAES,
+ FeatureAVX2,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureCLFLUSHOPT,
+ FeatureCLZERO,
+ FeatureCMOV,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureF16C,
+ FeatureFMA,
+ FeatureFSGSBase,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureFastLZCNT,
+ FeatureLAHFSAHF,
+ FeatureLZCNT,
+ FeatureFastBEXTR,
+ FeatureFast15ByteNOP,
+ FeatureBranchFusion,
+ FeatureFastScalarShiftMasks,
+ FeatureMMX,
+ FeatureMOVBE,
+ FeatureMWAITX,
+ FeaturePCLMUL,
+ FeaturePOPCNT,
+ FeaturePRFCHW,
+ FeatureRDRAND,
+ FeatureRDSEED,
+ FeatureSHA,
+ FeatureSSE4A,
+ FeatureSlowSHLD,
+ FeatureX87,
+ FeatureXSAVE,
+ FeatureXSAVEC,
+ FeatureXSAVEOPT,
+ FeatureXSAVES];
+ list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
+ FeatureRDPID,
+ FeatureWBNOINVD];
+ list<SubtargetFeature> ZN2Features =
+ !listconcat(ZNFeatures, ZN2AdditionalFeatures);
+}
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
-def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16]>;
+// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
+// if i386/i486 is specifically requested.
+def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16,
+ FeatureCMPXCHG8B]>;
def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>;
-def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>;
-def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>;
-def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
-
-def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
-def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV,
- FeatureNOPL]>;
-
-def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureCMOV, FeatureFXSR, FeatureNOPL]>;
+def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16,
+ FeatureCMPXCHG8B]>;
+def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16,
+ FeatureCMPXCHG8B]>;
+def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16,
+ FeatureCMPXCHG8B, FeatureMMX]>;
+
+def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureCMOV]>;
+def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureCMOV, FeatureNOPL]>;
+
+def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureCMOV, FeatureFXSR,
+ FeatureNOPL]>;
foreach P = ["pentium3", "pentium3m"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
- FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,FeatureMMX,
+ FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
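The ProcessorFeatures block introduced in the hunk above replaces the old
per-CPU feature dumps with an explicit inheritance scheme: each CPU
concatenates its parent's inheritable list with its own additions, while
one-off tuning flags (false dependencies, proc-family tags) stay in a separate
non-inherited list. A rough C++ analogue of that !listconcat composition, with
plain strings standing in for SubtargetFeature records and all names
illustrative only:

// feature_inheritance_sketch.cpp -- analogy for the pattern above.
#include <iostream>
#include <string>
#include <vector>

using FeatureList = std::vector<std::string>;

FeatureList concat(const FeatureList &A, const FeatureList &B) {
  FeatureList R = A;
  R.insert(R.end(), B.begin(), B.end());
  return R;
}

int main() {
  // Inheritable features propagate to successor CPUs...
  FeatureList NHMInheritable = {"x87", "cx8", "cmov", "sse4.2", "popcnt"};
  FeatureList WSMInheritable = concat(NHMInheritable, {"pclmul"});
  FeatureList SNBInheritable = concat(WSMInheritable, {"avx", "xsave"});
  // ...while CPU-specific tuning flags are appended last and not inherited.
  FeatureList SNBFeatures = concat(SNBInheritable, {"slow-unaligned-mem-32"});

  for (const auto &F : SNBFeatures)
    std::cout << F << '\n';
}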
@@ -498,13 +967,15 @@ foreach P = ["pentium3", "pentium3m"] in {
// changes slightly.
def : ProcessorModel<"pentium-m", GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
+ FeatureCMOV]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcessorModel<P, GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
+ FeatureCMOV]>;
}
// Intel Quark.
@@ -512,16 +983,19 @@ def : Proc<"lakemont", []>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
- FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
+ FeatureCMOV]>;
// NetBurst.
def : ProcessorModel<"prescott", GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
- FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
+ FeatureCMOV]>;
def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureSlowUAMem16,
+ FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE3,
@@ -535,6 +1009,7 @@ def : ProcessorModel<"nocona", GenericPostRAModel, [
def : ProcessorModel<"core2", SandyBridgeModel, [
FeatureX87,
FeatureSlowUAMem16,
+ FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSSE3,
@@ -548,6 +1023,7 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureX87,
FeatureSlowUAMem16,
+ FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE41,
@@ -560,638 +1036,131 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
]>;
// Atom CPUs.
-class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
- ProcIntelAtom,
- FeatureX87,
- FeatureSlowUAMem16,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSSE3,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureMOVBE,
- FeatureLEAForSP,
- FeatureSlowDivide32,
- FeatureSlowDivide64,
- FeatureSlowTwoMemOps,
- FeatureLEAUsesAG,
- FeaturePadShortFunctions,
- FeatureLAHFSAHF
-]>;
-def : BonnellProc<"bonnell">;
-def : BonnellProc<"atom">; // Pin the generic name to the baseline.
-
-class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
- ProcIntelSLM,
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSE42,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureMOVBE,
- FeaturePOPCNT,
- FeaturePCLMUL,
- FeatureSlowDivide64,
- FeatureSlowTwoMemOps,
- FeaturePRFCHW,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeatureSlowPMULLD,
- FeatureRDRAND,
- FeatureLAHFSAHF,
- FeaturePOPCNTFalseDeps
-]>;
-def : SilvermontProc<"silvermont">;
-def : SilvermontProc<"slm">; // Legacy alias.
-
-class ProcessorFeatures<list<SubtargetFeature> Inherited,
- list<SubtargetFeature> NewFeatures> {
- list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures);
+foreach P = ["bonnell", "atom"] in {
+ def : ProcessorModel<P, AtomModel, ProcessorFeatures.AtomFeatures>;
}
-class ProcModel<string Name, SchedMachineModel Model,
- list<SubtargetFeature> ProcFeatures,
- list<SubtargetFeature> OtherFeatures> :
- ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>;
-
-def GLMFeatures : ProcessorFeatures<[], [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSE42,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureMOVBE,
- FeaturePOPCNT,
- FeaturePCLMUL,
- FeatureAES,
- FeaturePRFCHW,
- FeatureSlowTwoMemOps,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeatureLAHFSAHF,
- FeatureMPX,
- FeatureSHA,
- FeatureRDRAND,
- FeatureRDSEED,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureXSAVEC,
- FeatureXSAVES,
- FeatureCLFLUSHOPT,
- FeatureFSGSBase
-]>;
+foreach P = ["silvermont", "slm"] in {
+ def : ProcessorModel<P, SLMModel, ProcessorFeatures.SLMFeatures>;
+}
-class GoldmontProc<string Name> : ProcModel<Name, SLMModel,
- GLMFeatures.Value, [
- ProcIntelGLM,
- FeaturePOPCNTFalseDeps
-]>;
-def : GoldmontProc<"goldmont">;
-
-def GLPFeatures : ProcessorFeatures<GLMFeatures.Value, [
- FeaturePTWRITE,
- FeatureRDPID,
- FeatureSGX
-]>;
-
-class GoldmontPlusProc<string Name> : ProcModel<Name, SLMModel,
- GLPFeatures.Value, [
- ProcIntelGLP
-]>;
-def : GoldmontPlusProc<"goldmont-plus">;
-
-class TremontProc<string Name> : ProcModel<Name, SLMModel,
- GLPFeatures.Value, [
- ProcIntelTRM,
- FeatureCLDEMOTE,
- FeatureGFNI,
- FeatureMOVDIRI,
- FeatureMOVDIR64B,
- FeatureWAITPKG
-]>;
-def : TremontProc<"tremont">;
+def : ProcessorModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures>;
+def : ProcessorModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures>;
+def : ProcessorModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures>;
// "Arrandale" along with corei3 and corei5
-class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSE42,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePOPCNT,
- FeatureLAHFSAHF,
- FeatureMacroFusion
-]>;
-def : NehalemProc<"nehalem">;
-def : NehalemProc<"corei7">;
+foreach P = ["nehalem", "corei7"] in {
+ def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures>;
+}
-// Westmere is a similar machine to nehalem with some additional features.
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
-class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSE42,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePOPCNT,
- FeaturePCLMUL,
- FeatureLAHFSAHF,
- FeatureMacroFusion
-]>;
-def : WestmereProc<"westmere">;
-
-// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
-// rather than a superset.
-def SNBFeatures : ProcessorFeatures<[], [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePOPCNT,
- FeatureSlowDivide64,
- FeaturePCLMUL,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureLAHFSAHF,
- FeatureSlow3OpsLEA,
- FeatureFastScalarFSQRT,
- FeatureFastSHLDRotate,
- FeatureSlowIncDec,
- FeatureMergeToThreeWayBranch,
- FeatureMacroFusion
-]>;
-
-class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
- SNBFeatures.Value, [
- FeatureSlowUAMem32,
- FeaturePOPCNTFalseDeps
-]>;
-def : SandyBridgeProc<"sandybridge">;
-def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
-
-def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [
- FeatureRDRAND,
- FeatureF16C,
- FeatureFSGSBase
-]>;
-
-class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
- IVBFeatures.Value, [
- FeatureSlowUAMem32,
- FeaturePOPCNTFalseDeps
-]>;
-def : IvyBridgeProc<"ivybridge">;
-def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
-
-def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
- FeatureAVX2,
- FeatureBMI,
- FeatureBMI2,
- FeatureERMSB,
- FeatureFMA,
- FeatureINVPCID,
- FeatureLZCNT,
- FeatureMOVBE,
- FeatureFastVariableShuffle
-]>;
-
-class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
- HSWFeatures.Value, [
- FeaturePOPCNTFalseDeps,
- FeatureLZCNTFalseDeps
-]>;
-def : HaswellProc<"haswell">;
-def : HaswellProc<"core-avx2">; // Legacy alias.
+def : ProcessorModel<"westmere", SandyBridgeModel,
+ ProcessorFeatures.WSMFeatures>;
-def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
- FeatureADX,
- FeatureRDSEED,
- FeaturePRFCHW
-]>;
-class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
- BDWFeatures.Value, [
- FeaturePOPCNTFalseDeps,
- FeatureLZCNTFalseDeps
-]>;
-def : BroadwellProc<"broadwell">;
-
-def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
- FeatureAES,
- FeatureMPX,
- FeatureXSAVEC,
- FeatureXSAVES,
- FeatureCLFLUSHOPT,
- FeatureFastVectorFSQRT
-]>;
-
-class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
- SKLFeatures.Value, [
- FeatureHasFastGather,
- FeaturePOPCNTFalseDeps,
- FeatureSGX
-]>;
-def : SkylakeClientProc<"skylake">;
+foreach P = ["sandybridge", "corei7-avx"] in {
+ def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures>;
+}
-def KNLFeatures : ProcessorFeatures<[], [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePOPCNT,
- FeatureSlowDivide64,
- FeaturePCLMUL,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureLAHFSAHF,
- FeatureSlow3OpsLEA,
- FeatureSlowIncDec,
- FeatureAES,
- FeatureRDRAND,
- FeatureF16C,
- FeatureFSGSBase,
- FeatureAVX512,
- FeatureERI,
- FeatureCDI,
- FeaturePFI,
- FeaturePREFETCHWT1,
- FeatureADX,
- FeatureRDSEED,
- FeatureMOVBE,
- FeatureLZCNT,
- FeatureBMI,
- FeatureBMI2,
- FeatureFMA,
- FeaturePRFCHW
-]>;
+foreach P = ["ivybridge", "core-avx-i"] in {
+ def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures>;
+}
-// FIXME: define KNL model
-class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
- KNLFeatures.Value, [
- FeatureSlowTwoMemOps,
- FeatureFastPartialYMMorZMMWrite,
- FeatureHasFastGather,
- FeatureSlowPMADDWD
-]>;
-def : KnightsLandingProc<"knl">;
-
-class KnightsMillProc<string Name> : ProcModel<Name, HaswellModel,
- KNLFeatures.Value, [
- FeatureSlowTwoMemOps,
- FeatureFastPartialYMMorZMMWrite,
- FeatureHasFastGather,
- FeatureSlowPMADDWD,
- FeatureVPOPCNTDQ
-]>;
-def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features
-
-def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
- FeatureAVX512,
- FeatureCDI,
- FeatureDQI,
- FeatureBWI,
- FeatureVLX,
- FeaturePKU,
- FeatureCLWB
-]>;
+foreach P = ["haswell", "core-avx2"] in {
+ def : ProcessorModel<P, HaswellModel, ProcessorFeatures.HSWFeatures>;
+}
-class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
- SKXFeatures.Value, [
- FeatureHasFastGather,
- FeaturePOPCNTFalseDeps
-]>;
-def : SkylakeServerProc<"skylake-avx512">;
-def : SkylakeServerProc<"skx">; // Legacy alias.
+def : ProcessorModel<"broadwell", BroadwellModel,
+ ProcessorFeatures.BDWFeatures>;
-def CLXFeatures : ProcessorFeatures<SKXFeatures.Value, [
- FeatureVNNI
-]>;
+def : ProcessorModel<"skylake", SkylakeClientModel,
+ ProcessorFeatures.SKLFeatures>;
-class CascadelakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
- CLXFeatures.Value, [
- FeatureHasFastGather,
- FeaturePOPCNTFalseDeps
-]>;
-def : CascadelakeProc<"cascadelake">;
-
-def CNLFeatures : ProcessorFeatures<SKLFeatures.Value, [
- FeatureAVX512,
- FeatureCDI,
- FeatureDQI,
- FeatureBWI,
- FeatureVLX,
- FeaturePKU,
- FeatureVBMI,
- FeatureIFMA,
- FeatureSHA,
- FeatureSGX
-]>;
+// FIXME: define KNL scheduler model
+def : ProcessorModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures>;
+def : ProcessorModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures>;
-class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
- CNLFeatures.Value, [
- FeatureHasFastGather
-]>;
-def : CannonlakeProc<"cannonlake">;
-
-def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [
- FeatureBITALG,
- FeatureVAES,
- FeatureVBMI2,
- FeatureVNNI,
- FeatureVPCLMULQDQ,
- FeatureVPOPCNTDQ,
- FeatureGFNI,
- FeatureCLWB,
- FeatureRDPID
-]>;
-
-class IcelakeClientProc<string Name> : ProcModel<Name, SkylakeServerModel,
- ICLFeatures.Value, [
- FeatureHasFastGather
-]>;
-def : IcelakeClientProc<"icelake-client">;
+foreach P = ["skylake-avx512", "skx"] in {
+ def : ProcessorModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures>;
+}
-class IcelakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
- ICLFeatures.Value, [
- FeaturePCONFIG,
- FeatureWBNOINVD,
- FeatureHasFastGather
-]>;
-def : IcelakeServerProc<"icelake-server">;
+def : ProcessorModel<"cascadelake", SkylakeServerModel,
+ ProcessorFeatures.CLXFeatures>;
+def : ProcessorModel<"cooperlake", SkylakeServerModel,
+ ProcessorFeatures.CPXFeatures>;
+def : ProcessorModel<"cannonlake", SkylakeServerModel,
+ ProcessorFeatures.CNLFeatures>;
+def : ProcessorModel<"icelake-client", SkylakeServerModel,
+ ProcessorFeatures.ICLFeatures>;
+def : ProcessorModel<"icelake-server", SkylakeServerModel,
+ ProcessorFeatures.ICXFeatures>;
// AMD CPUs.
-def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
-def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX]>;
+def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ Feature3DNow]>;
+def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ Feature3DNow]>;
foreach P = ["athlon", "athlon-tbird"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV, Feature3DNowA,
- FeatureNOPL, FeatureSlowSHLD]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
+ Feature3DNowA, FeatureNOPL, FeatureSlowSHLD]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV, FeatureSSE1,
- Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureSlowSHLD]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
+ FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL,
+ FeatureSlowSHLD]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
- FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureSlowSHLD,
- FeatureCMOV]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL,
+ Feature64Bit, FeatureSlowSHLD, FeatureCMOV,
+ FeatureFastScalarShiftMasks]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
- FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureSlowSHLD,
- FeatureCMOV, Feature64Bit]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3,
+ Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B,
+ FeatureSlowSHLD, FeatureCMOV, Feature64Bit,
+ FeatureFastScalarShiftMasks]>;
}
foreach P = ["amdfam10", "barcelona"] in {
- def : Proc<P, [FeatureX87, FeatureSSE4A, Feature3DNowA, FeatureFXSR,
- FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
- FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV, Feature64Bit]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE4A, Feature3DNowA,
+ FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT,
+ FeaturePOPCNT, FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV,
+ Feature64Bit, FeatureFastScalarShiftMasks]>;
}
// Bobcat
-def : Proc<"btver1", [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSSE3,
- FeatureSSE4A,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePRFCHW,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureFast15ByteNOP
-]>;
-
+def : Proc<"btver1", ProcessorFeatures.BtVer1Features>;
// Jaguar
-def : ProcessorModel<"btver2", BtVer2Model, [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureNOPL,
- FeatureSSE4A,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePRFCHW,
- FeatureAES,
- FeaturePCLMUL,
- FeatureBMI,
- FeatureF16C,
- FeatureMOVBE,
- FeatureLZCNT,
- FeatureFastLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureFast15ByteNOP,
- FeatureFastBEXTR,
- FeatureFastPartialYMMorZMMWrite,
- FeatureFastHorizontalOps
-]>;
+def : ProcessorModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features>;
// Bulldozer
-def : ProcessorModel<"bdver1", BdVer2Model, [
- FeatureX87,
- FeatureCMOV,
- FeatureXOP,
- FeatureFMA4,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureAES,
- FeaturePRFCHW,
- FeaturePCLMUL,
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureNOPL,
- FeatureSSE4A,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureLWP,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureFast11ByteNOP,
- FeatureMacroFusion
-]>;
+def : ProcessorModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features>;
// Piledriver
-def : ProcessorModel<"bdver2", BdVer2Model, [
- FeatureX87,
- FeatureCMOV,
- FeatureXOP,
- FeatureFMA4,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureAES,
- FeaturePRFCHW,
- FeaturePCLMUL,
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureNOPL,
- FeatureSSE4A,
- FeatureF16C,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureBMI,
- FeatureTBM,
- FeatureLWP,
- FeatureFMA,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureFast11ByteNOP,
- FeatureFastBEXTR,
- FeatureMacroFusion
-]>;
-
+def : ProcessorModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features>;
// Steamroller
-def : Proc<"bdver3", [
- FeatureX87,
- FeatureCMOV,
- FeatureXOP,
- FeatureFMA4,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureAES,
- FeaturePRFCHW,
- FeaturePCLMUL,
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureNOPL,
- FeatureSSE4A,
- FeatureF16C,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureBMI,
- FeatureTBM,
- FeatureLWP,
- FeatureFMA,
- FeatureXSAVEOPT,
- FeatureSlowSHLD,
- FeatureFSGSBase,
- FeatureLAHFSAHF,
- FeatureFast11ByteNOP,
- FeatureFastBEXTR,
- FeatureMacroFusion
-]>;
-
+def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>;
// Excavator
-def : Proc<"bdver4", [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureAVX2,
- FeatureFXSR,
- FeatureNOPL,
- FeatureXOP,
- FeatureFMA4,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureAES,
- FeaturePRFCHW,
- FeaturePCLMUL,
- FeatureF16C,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureBMI,
- FeatureBMI2,
- FeatureTBM,
- FeatureLWP,
- FeatureFMA,
- FeatureXSAVEOPT,
- FeatureSlowSHLD,
- FeatureFSGSBase,
- FeatureLAHFSAHF,
- FeatureFastBEXTR,
- FeatureFast11ByteNOP,
- FeatureMWAITX,
- FeatureMacroFusion
-]>;
+def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>;
-// Znver1
-def: ProcessorModel<"znver1", Znver1Model, [
- FeatureADX,
- FeatureAES,
- FeatureAVX2,
- FeatureBMI,
- FeatureBMI2,
- FeatureCLFLUSHOPT,
- FeatureCLZERO,
- FeatureCMOV,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureF16C,
- FeatureFMA,
- FeatureFSGSBase,
- FeatureFXSR,
- FeatureNOPL,
- FeatureFastLZCNT,
- FeatureLAHFSAHF,
- FeatureLZCNT,
- FeatureFastBEXTR,
- FeatureFast15ByteNOP,
- FeatureMacroFusion,
- FeatureMMX,
- FeatureMOVBE,
- FeatureMWAITX,
- FeaturePCLMUL,
- FeaturePOPCNT,
- FeaturePRFCHW,
- FeatureRDRAND,
- FeatureRDSEED,
- FeatureSHA,
- FeatureSSE4A,
- FeatureSlowSHLD,
- FeatureX87,
- FeatureXSAVE,
- FeatureXSAVEC,
- FeatureXSAVEOPT,
- FeatureXSAVES]>;
+def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>;
+def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>;
-def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>;
+def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE1, FeatureFXSR, FeatureCMOV]>;
+def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureSSE1, FeatureFXSR,
+ FeatureCMOV]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@@ -1205,6 +1174,7 @@ def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
// forming a common base for them.
def : ProcessorModel<"x86-64", SandyBridgeModel, [
FeatureX87,
+ FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE2,
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 36cef98a1ef5..80120722e0e6 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,9 +12,10 @@
//===----------------------------------------------------------------------===//
#include "X86AsmPrinter.h"
-#include "InstPrinter/X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86TargetStreamer.h"
+#include "TargetInfo/X86TargetInfo.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "llvm/BinaryFormat/COFF.h"
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@@ -104,16 +105,16 @@ void X86AsmPrinter::EmitFunctionBodyEnd() {
}
}
-/// printSymbolOperand - Print a raw symbol reference operand. This handles
+/// PrintSymbolOperand - Print a raw symbol reference operand. This handles
/// jump tables, constant pools, global address and external symbols, all of
/// which print to a label with various suffixes for relocation types etc.
-static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
- raw_ostream &O) {
+void X86AsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
+ raw_ostream &O) {
switch (MO.getType()) {
default: llvm_unreachable("unknown symbol type!");
case MachineOperand::MO_ConstantPoolIndex:
- P.GetCPISymbol(MO.getIndex())->print(O, P.MAI);
- P.printOffset(MO.getOffset(), O);
+ GetCPISymbol(MO.getIndex())->print(O, MAI);
+ printOffset(MO.getOffset(), O);
break;
case MachineOperand::MO_GlobalAddress: {
const GlobalValue *GV = MO.getGlobal();
@@ -121,38 +122,37 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
MCSymbol *GVSym;
if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE)
- GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ GVSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
else
- GVSym = P.getSymbol(GV);
+ GVSym = getSymbol(GV);
// Handle dllimport linkage.
if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
- GVSym =
- P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+ GVSym = OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName());
else if (MO.getTargetFlags() == X86II::MO_COFFSTUB)
GVSym =
- P.OutContext.getOrCreateSymbol(Twine(".refptr.") + GVSym->getName());
+ OutContext.getOrCreateSymbol(Twine(".refptr.") + GVSym->getName());
if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
- MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MCSymbol *Sym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
- P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
if (!StubSym.getPointer())
- StubSym = MachineModuleInfoImpl::
- StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
+ StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+ !GV->hasInternalLinkage());
}
// If the name begins with a dollar-sign, enclose it in parens. We do this
// to avoid having it look like an integer immediate to the assembler.
if (GVSym->getName()[0] != '$')
- GVSym->print(O, P.MAI);
+ GVSym->print(O, MAI);
else {
O << '(';
- GVSym->print(O, P.MAI);
+ GVSym->print(O, MAI);
O << ')';
}
- P.printOffset(MO.getOffset(), O);
+ printOffset(MO.getOffset(), O);
break;
}
}
@@ -169,13 +169,13 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
break;
case X86II::MO_GOT_ABSOLUTE_ADDRESS:
O << " + [.-";
- P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ MF->getPICBaseSymbol()->print(O, MAI);
O << ']';
break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
O << '-';
- P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ MF->getPICBaseSymbol()->print(O, MAI);
break;
case X86II::MO_TLSGD: O << "@TLSGD"; break;
case X86II::MO_TLSLD: O << "@TLSLD"; break;
@@ -193,76 +193,91 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
case X86II::MO_TLVP: O << "@TLVP"; break;
case X86II::MO_TLVP_PIC_BASE:
O << "@TLVP" << '-';
- P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ MF->getPICBaseSymbol()->print(O, MAI);
break;
case X86II::MO_SECREL: O << "@SECREL32"; break;
}
}
-static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned OpNo, raw_ostream &O,
- const char *Modifier = nullptr, unsigned AsmVariant = 0);
-
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value. These print slightly differently, for
-/// example, a $ is not emitted.
-static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned OpNo, raw_ostream &O) {
+void X86AsmPrinter::PrintOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNo);
+ const bool IsATT = MI->getInlineAsmDialect() == InlineAsm::AD_ATT;
switch (MO.getType()) {
- default: llvm_unreachable("Unknown pcrel immediate operand");
- case MachineOperand::MO_Register:
- // pc-relativeness was handled when computing the value in the reg.
- printOperand(P, MI, OpNo, O);
+ default: llvm_unreachable("unknown operand type!");
+ case MachineOperand::MO_Register: {
+ if (IsATT)
+ O << '%';
+ O << X86ATTInstPrinter::getRegisterName(MO.getReg());
return;
+ }
+
case MachineOperand::MO_Immediate:
+ if (IsATT)
+ O << '$';
O << MO.getImm();
return;
- case MachineOperand::MO_GlobalAddress:
- printSymbolOperand(P, MO, O);
- return;
+
+ case MachineOperand::MO_GlobalAddress: {
+ if (IsATT)
+ O << '$';
+ PrintSymbolOperand(MO, O);
+ break;
+ }
+ case MachineOperand::MO_BlockAddress: {
+ MCSymbol *Sym = GetBlockAddressSymbol(MO.getBlockAddress());
+ Sym->print(O, MAI);
+ break;
+ }
}
}
-static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned OpNo, raw_ostream &O, const char *Modifier,
- unsigned AsmVariant) {
+/// PrintModifiedOperand - Print subregisters based on supplied modifier,
+/// deferring to PrintOperand() if no modifier was supplied or if operand is not
+/// a register.
+void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
const MachineOperand &MO = MI->getOperand(OpNo);
- switch (MO.getType()) {
- default: llvm_unreachable("unknown operand type!");
- case MachineOperand::MO_Register: {
- // FIXME: Enumerating AsmVariant, so we can remove magic number.
- if (AsmVariant == 0) O << '%';
- unsigned Reg = MO.getReg();
- if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
- unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 :
- (strcmp(Modifier+6,"32") == 0) ? 32 :
- (strcmp(Modifier+6,"16") == 0) ? 16 : 8;
- Reg = getX86SubSuperRegister(Reg, Size);
- }
- O << X86ATTInstPrinter::getRegisterName(Reg);
- return;
+ if (!Modifier || MO.getType() != MachineOperand::MO_Register)
+ return PrintOperand(MI, OpNo, O);
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT)
+ O << '%';
+ unsigned Reg = MO.getReg();
+ if (strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 :
+ (strcmp(Modifier+6,"32") == 0) ? 32 :
+ (strcmp(Modifier+6,"16") == 0) ? 16 : 8;
+ Reg = getX86SubSuperRegister(Reg, Size);
}
+ O << X86ATTInstPrinter::getRegisterName(Reg);
+}
+/// PrintPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value. These print slightly differently, for
+/// example, a $ is not emitted.
+void X86AsmPrinter::PrintPCRelImm(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("Unknown pcrel immediate operand");
+ case MachineOperand::MO_Register:
+ // pc-relativeness was handled when computing the value in the reg.
+ PrintOperand(MI, OpNo, O);
+ return;
case MachineOperand::MO_Immediate:
- if (AsmVariant == 0) O << '$';
O << MO.getImm();
return;
-
- case MachineOperand::MO_GlobalAddress: {
- if (AsmVariant == 0) O << '$';
- printSymbolOperand(P, MO, O);
- break;
- }
+ case MachineOperand::MO_GlobalAddress:
+ PrintSymbolOperand(MO, O);
+ return;
}
}
-static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned Op, raw_ostream &O,
- const char *Modifier = nullptr) {
- const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
- const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
- const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+void X86AsmPrinter::PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg);
+ const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp);
// If we really don't want to print out (rip), don't.
bool HasBaseReg = BaseReg.getReg() != 0;
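PrintModifiedOperand above preserves the string-based "subreg" modifier
handling ("subreg64", "subreg32", "subreg16", anything else meaning 8 bits)
before calling getX86SubSuperRegister. A self-contained sketch of just that
suffix parsing, with the register lookup replaced by a width return so it can
run in isolation; the function name is invented for the example:

// subreg_modifier_sketch.cpp -- isolates the modifier string handling only.
#include <cstdio>
#include <cstring>

// Stand-in for the real lookup: report the requested width instead of
// returning an actual X86 register enum value.
unsigned subregWidthFromModifier(const char *Modifier) {
  if (strncmp(Modifier, "subreg", strlen("subreg")) != 0)
    return 0; // not a subreg modifier; caller prints the register unchanged
  const char *Suffix = Modifier + strlen("subreg");
  return (strcmp(Suffix, "64") == 0) ? 64 :
         (strcmp(Suffix, "32") == 0) ? 32 :
         (strcmp(Suffix, "16") == 0) ? 16 : 8;
}

int main() {
  printf("%u %u %u %u\n",
         subregWidthFromModifier("subreg64"),  // 64
         subregWidthFromModifier("subreg32"),  // 32
         subregWidthFromModifier("subreg16"),  // 16
         subregWidthFromModifier("subreg8"));  // 8 (any other suffix)
}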
@@ -284,7 +299,8 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
}
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ConstantPoolIndex:
- printSymbolOperand(P, DispSpec, O);
+ PrintSymbolOperand(DispSpec, O);
+ break;
}
if (Modifier && strcmp(Modifier, "H") == 0)
@@ -296,12 +312,12 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
O << '(';
if (HasBaseReg)
- printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier);
+ PrintModifiedOperand(MI, OpNo + X86::AddrBaseReg, O, Modifier);
if (IndexReg.getReg()) {
O << ',';
- printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier);
- unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+ PrintModifiedOperand(MI, OpNo + X86::AddrIndexReg, O, Modifier);
+ unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm();
if (ScaleVal != 1)
O << ',' << ScaleVal;
}
@@ -309,31 +325,28 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
}
}
-static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned Op, raw_ostream &O,
- const char *Modifier = nullptr) {
- assert(isMem(*MI, Op) && "Invalid memory reference!");
- const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg);
+void X86AsmPrinter::PrintMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
+ assert(isMem(*MI, OpNo) && "Invalid memory reference!");
+ const MachineOperand &Segment = MI->getOperand(OpNo + X86::AddrSegmentReg);
if (Segment.getReg()) {
- printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier);
+ PrintModifiedOperand(MI, OpNo + X86::AddrSegmentReg, O, Modifier);
O << ':';
}
- printLeaMemReference(P, MI, Op, O, Modifier);
+ PrintLeaMemReference(MI, OpNo, O, Modifier);
}
-static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned Op, raw_ostream &O,
- const char *Modifier = nullptr,
- unsigned AsmVariant = 1) {
- const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
- unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
- const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
- const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
- const MachineOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);
+void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
+ unsigned OpNo, raw_ostream &O) {
+ const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg);
+ unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm();
+ const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg);
+ const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp);
+ const MachineOperand &SegReg = MI->getOperand(OpNo + X86::AddrSegmentReg);
// If this has a segment register, print it.
if (SegReg.getReg()) {
- printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant);
+ PrintOperand(MI, OpNo + X86::AddrSegmentReg, O);
O << ':';
}
@@ -341,7 +354,7 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
bool NeedPlus = false;
if (BaseReg.getReg()) {
- printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant);
+ PrintOperand(MI, OpNo + X86::AddrBaseReg, O);
NeedPlus = true;
}
@@ -349,13 +362,13 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
if (NeedPlus) O << " + ";
if (ScaleVal != 1)
O << ScaleVal << '*';
- printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant);
+ PrintOperand(MI, OpNo + X86::AddrIndexReg, O);
NeedPlus = true;
}
if (!DispSpec.isImm()) {
if (NeedPlus) O << " + ";
- printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant);
+ PrintOperand(MI, OpNo + X86::AddrDisp, O);
} else {
int64_t DispVal = DispSpec.getImm();
if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
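PrintIntelMemReference assembles "seg:[base + scale*index + disp]" piece by
piece, tracking whether a '+' separator is still needed. The tail of the
displacement handling is cut off by the hunk above, so the sign handling in
this sketch is an assumption; plain strings stand in for MachineOperands and
the helper name is invented:

// intel_mem_sketch.cpp -- illustrative reconstruction, not the in-tree code.
#include <cstdint>
#include <iostream>
#include <string>

std::string formatIntelMem(const std::string &Seg, const std::string &Base,
                           const std::string &Index, unsigned Scale,
                           int64_t Disp) {
  std::string O;
  if (!Seg.empty())
    O += Seg + ":";
  O += "[";
  bool NeedPlus = false;
  if (!Base.empty()) {
    O += Base;
    NeedPlus = true;
  }
  if (!Index.empty()) {
    if (NeedPlus) O += " + ";
    if (Scale != 1) O += std::to_string(Scale) + "*";
    O += Index;
    NeedPlus = true;
  }
  // Print the displacement if it is nonzero, or if nothing else was printed;
  // the sign handling below is an assumption about the elided lines.
  if (Disp != 0 || (Base.empty() && Index.empty())) {
    if (NeedPlus)
      O += (Disp < 0) ? " - " : " + ";
    O += std::to_string(Disp < 0 ? -Disp : Disp);
  }
  return O + "]";
}

int main() {
  std::cout << formatIntelMem("", "rax", "rcx", 4, 16) << '\n'; // [rax + 4*rcx + 16]
  std::cout << formatIntelMem("fs", "", "", 1, 40) << '\n';     // fs:[40]
}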
@@ -418,7 +431,6 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
@@ -429,7 +441,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
switch (ExtraCode[0]) {
default:
// See if this is a generic print operand
- return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
case 'a': // This is an address. Currently only 'i' and 'r' are expected.
switch (MO.getType()) {
default:
@@ -442,13 +454,13 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case MachineOperand::MO_ExternalSymbol:
llvm_unreachable("unexpected operand type!");
case MachineOperand::MO_GlobalAddress:
- printSymbolOperand(*this, MO, O);
+ PrintSymbolOperand(MO, O);
if (Subtarget->isPICStyleRIPRel())
O << "(%rip)";
return false;
case MachineOperand::MO_Register:
O << '(';
- printOperand(*this, MI, OpNo, O);
+ PrintOperand(MI, OpNo, O);
O << ')';
return false;
}
@@ -456,7 +468,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'c': // Don't print "$" before a global var name or constant.
switch (MO.getType()) {
default:
- printOperand(*this, MI, OpNo, O);
+ PrintOperand(MI, OpNo, O);
break;
case MachineOperand::MO_Immediate:
O << MO.getImm();
@@ -466,7 +478,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case MachineOperand::MO_ExternalSymbol:
llvm_unreachable("unexpected operand type!");
case MachineOperand::MO_GlobalAddress:
- printSymbolOperand(*this, MO, O);
+ PrintSymbolOperand(MO, O);
break;
}
return false;
@@ -474,7 +486,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'A': // Print '*' before a register (it must be a register)
if (MO.isReg()) {
O << '*';
- printOperand(*this, MI, OpNo, O);
+ PrintOperand(MI, OpNo, O);
return false;
}
return true;
@@ -487,11 +499,11 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'V': // Print native register without '%'
if (MO.isReg())
return printAsmMRegister(*this, MO, ExtraCode[0], O);
- printOperand(*this, MI, OpNo, O);
+ PrintOperand(MI, OpNo, O);
return false;
case 'P': // This is the operand of a call, treat specially.
- printPCRelImm(*this, MI, OpNo, O);
+ PrintPCRelImm(MI, OpNo, O);
return false;
case 'n': // Negate the immediate or print a '-' before the operand.
@@ -505,16 +517,15 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
}
- printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant);
+ PrintOperand(MI, OpNo, O);
return false;
}
-bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNo, unsigned AsmVariant,
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode,
raw_ostream &O) {
- if (AsmVariant) {
- printIntelMemReference(*this, MI, OpNo, O);
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
+ PrintIntelMemReference(MI, OpNo, O);
return false;
}
@@ -531,14 +542,14 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
// These only apply to registers, ignore on mem.
break;
case 'H':
- printMemReference(*this, MI, OpNo, O, "H");
+ PrintMemReference(MI, OpNo, O, "H");
return false;
case 'P': // Don't print @PLT, but do print as memory.
- printMemReference(*this, MI, OpNo, O, "no-rip");
+ PrintMemReference(MI, OpNo, O, "no-rip");
return false;
}
}
- printMemReference(*this, MI, OpNo, O);
+ PrintMemReference(MI, OpNo, O, nullptr);
return false;
}
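
Note: the single-letter modifiers handled above ('a', 'c', 'A', 'P', 'n', 'H', ...) surface through GNU extended inline asm. A minimal sketch, with the constraint and modifier chosen for illustration rather than taken from this patch, showing the 'c' modifier that reaches the MO_Immediate path:

    // Build with gcc/clang targeting x86. "%c1" prints the constant with no
    // '$' prefix, so the '$' is written by hand in the template; the
    // expansion is "movl $42, <reg>".
    int get_constant() {
      int out;
      asm("movl $%c1, %0" : "=r"(out) : "i"(42));
      return out;
    }
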
@@ -683,26 +694,31 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// stripping. Since LLVM never generates code that does this, it is always
// safe to set.
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
- return;
- }
-
- if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) {
- StringRef SymbolName =
- (TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused";
- MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
- OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
- return;
- }
-
- if (TT.isOSBinFormatCOFF()) {
+ } else if (TT.isOSBinFormatCOFF()) {
+ if (MMI->usesMSVCFloatingPoint()) {
+ // In Windows' libcmt.lib, there is a file which is linked in only if the
+ // symbol _fltused is referenced. Linking this in causes some
+ // side-effects:
+ //
+ // 1. For x86-32, it will set the x87 rounding mode to 53-bit instead of
+ // 64-bit mantissas at program start.
+ //
+ // 2. It links in support routines for floating-point in scanf and printf.
+ //
+ // MSVC emits an undefined reference to _fltused when there are any
+ // floating point operations in the program (including calls). A program
+ // that only has: `scanf("%f", &global_float);` may fail to trigger this,
+ // but oh well...that's a documented issue.
+ StringRef SymbolName =
+ (TT.getArch() == Triple::x86) ? "__fltused" : "_fltused";
+ MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
+ OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+ return;
+ }
emitStackMaps(SM);
- return;
- }
-
- if (TT.isOSBinFormatELF()) {
+ } else if (TT.isOSBinFormatELF()) {
emitStackMaps(SM);
FM.serializeToFaultMapSection();
- return;
}
}
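
As a concrete illustration (a hypothetical translation unit, not part of the patch), any floating-point arithmetic in a COFF-targeted module should be enough to take this path and emit the global _fltused / __fltused reference:

    // x86_64-pc-windows-msvc: the multiply below marks the module as using
    // MSVC floating point, so EmitEndOfAsmFile emits a global reference to
    // _fltused (on 32-bit x86 the symbol is __fltused).
    double scale(double x) {
      return x * 2.5;
    }
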
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 55abdf2ba601..a011310970b3 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -1,9 +1,8 @@
//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -103,6 +102,18 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
// Choose between emitting .seh_ directives and .cv_fpo_ directives.
void EmitSEHInstruction(const MachineInstr *MI);
+ void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override;
+ void PrintOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+ void PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier);
+ void PrintPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+ void PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier);
+ void PrintMemReference(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
+ const char *Modifier);
+ void PrintIntelMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O);
+
public:
X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
@@ -124,11 +135,9 @@ public:
}
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
bool doInitialization(Module &M) override {
SMShadowTracker.reset(0);
diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index 627a6cb14514..3dcc1015dc7c 100644
--- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -1,9 +1,8 @@
//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -69,9 +68,7 @@ using DisplacementSizeMap = std::map<int64_t, unsigned>;
class X86AvoidSFBPass : public MachineFunctionPass {
public:
static char ID;
- X86AvoidSFBPass() : MachineFunctionPass(ID) {
- initializeX86AvoidSFBPassPass(*PassRegistry::getPassRegistry());
- }
+ X86AvoidSFBPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override {
return "X86 Avoid Store Forwarding Blocks";
@@ -343,6 +340,8 @@ findPotentialBlockers(MachineInstr *LoadInst) {
for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
E = LoadInst->getParent()->rend();
PBInst != E; ++PBInst) {
+ if (PBInst->isMetaInstruction())
+ continue;
BlockCount++;
if (BlockCount >= InspectionLimit)
break;
@@ -366,6 +365,8 @@ findPotentialBlockers(MachineInstr *LoadInst) {
for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
PME = PMBB->rend();
PBInst != PME; ++PBInst) {
+ if (PBInst->isMetaInstruction())
+ continue;
PredCount++;
if (PredCount >= LimitLeft)
break;
@@ -407,7 +408,10 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
// If the load and store are consecutive, use the loadInst location to
// reduce register pressure.
MachineInstr *StInst = StoreInst;
- if (StoreInst->getPrevNode() == LoadInst)
+ auto PrevInstrIt = skipDebugInstructionsBackward(
+ std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
+ MBB->instr_begin());
+ if (PrevInstrIt.getNodePtr() == LoadInst)
StInst = LoadInst;
MachineInstr *NewStore =
BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
@@ -492,19 +496,22 @@ void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
MachineOperand &LoadBase = getBaseOperand(LoadInst);
MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ auto StorePrevNonDbgInstr = skipDebugInstructionsBackward(
+ std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
+ LoadInst->getParent()->instr_begin()).getNodePtr();
if (LoadBase.isReg()) {
MachineInstr *LastLoad = LoadInst->getPrevNode();
// If the original load and store to xmm/ymm were consecutive
// then the partial copies were also created in
// a consecutive order to reduce register pressure,
// and the location of the last load is before the last store.
- if (StoreInst->getPrevNode() == LoadInst)
+ if (StorePrevNonDbgInstr == LoadInst)
LastLoad = LoadInst->getPrevNode()->getPrevNode();
getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
}
if (StoreBase.isReg()) {
MachineInstr *StInst = StoreInst;
- if (StoreInst->getPrevNode() == LoadInst)
+ if (StorePrevNonDbgInstr == LoadInst)
StInst = LoadInst;
getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
}
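
The recurring pattern in this file is to treat DBG_VALUE and other meta instructions as transparent when asking whether the store is immediately preceded by the load, so that debug info cannot change codegen decisions. A condensed sketch of that check (helper and iterator names as used in the patch; the surrounding pass context is assumed):

    // Returns true if Load is the closest non-debug instruction before Store
    // in the same basic block.
    static bool immediatelyPrecedes(llvm::MachineInstr *Load,
                                    llvm::MachineInstr *Store) {
      auto It = llvm::skipDebugInstructionsBackward(
          std::prev(llvm::MachineBasicBlock::instr_iterator(Store)),
          Store->getParent()->instr_begin());
      return It.getNodePtr() == Load;
    }
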
@@ -531,7 +538,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
continue;
int DefVR = MI.getOperand(0).getReg();
- if (!MRI->hasOneUse(DefVR))
+ if (!MRI->hasOneNonDBGUse(DefVR))
continue;
for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
UI != UE;) {
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 24d7a219e751..4df849a2e14c 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -1,9 +1,8 @@
//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -60,10 +59,7 @@ namespace {
class X86CallFrameOptimization : public MachineFunctionPass {
public:
- X86CallFrameOptimization() : MachineFunctionPass(ID) {
- initializeX86CallFrameOptimizationPass(
- *PassRegistry::getPassRegistry());
- }
+ X86CallFrameOptimization() : MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index 1dc83b76595d..b16b3839c85a 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -1,9 +1,8 @@
//===- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -48,8 +47,6 @@
using namespace llvm;
-#include "X86GenCallingConv.inc"
-
X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
: CallLowering(&TLI) {}
@@ -64,6 +61,7 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
SmallVector<EVT, 4> SplitVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+ assert(OrigArg.Regs.size() == 1 && "Can't handle multiple regs yet");

if (OrigArg.Ty->isVoidTy())
return true;
@@ -73,12 +71,12 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
if (NumParts == 1) {
// replace the original type ( pointer -> GPR ).
- SplitArgs.emplace_back(OrigArg.Reg, VT.getTypeForEVT(Context),
+ SplitArgs.emplace_back(OrigArg.Regs[0], VT.getTypeForEVT(Context),
OrigArg.Flags, OrigArg.IsFixed);
return true;
}
- SmallVector<unsigned, 8> SplitRegs;
+ SmallVector<Register, 8> SplitRegs;
EVT PartVT = TLI.getRegisterType(Context, VT);
Type *PartTy = PartVT.getTypeForEVT(Context);
@@ -88,7 +86,7 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)),
PartTy, OrigArg.Flags};
SplitArgs.push_back(Info);
- SplitRegs.push_back(Info.Reg);
+ SplitRegs.push_back(Info.Regs[0]);
}
PerformArgSplit(SplitRegs);
@@ -104,28 +102,28 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
DL(MIRBuilder.getMF().getDataLayout()),
STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {}
- unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0));
LLT SType = LLT::scalar(DL.getPointerSizeInBits(0));
- unsigned SPReg = MRI.createGenericVirtualRegister(p0);
+ Register SPReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildCopy(SPReg, STI.getRegisterInfo()->getStackRegister());
- unsigned OffsetReg = MRI.createGenericVirtualRegister(SType);
+ Register OffsetReg = MRI.createGenericVirtualRegister(SType);
MIRBuilder.buildConstant(OffsetReg, Offset);
- unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+ Register AddrReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
return AddrReg;
}
- void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- unsigned ExtReg;
+ Register ExtReg;
// If we are copying the value to a physical register with the
// size larger than the size of the value itself - build AnyExt
// to the size of the register first and only then do the copy.
@@ -146,12 +144,12 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
- void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- unsigned ExtReg = extendRegister(ValVReg, VA);
+ Register ExtReg = extendRegister(ValVReg, VA);
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
- /* Alignment */ 0);
+ /* Alignment */ 1);
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
@@ -185,7 +183,7 @@ protected:
bool X86CallLowering::lowerReturn(
MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<unsigned> VRegs) const {
+ ArrayRef<Register> VRegs) const {
assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) &&
"Return value without a vreg");
auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0);
@@ -208,7 +206,7 @@ bool X86CallLowering::lowerReturn(
ArgInfo CurArgInfo = ArgInfo{VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)};
setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
if (!splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs) {
+ [&](ArrayRef<Register> Regs) {
MIRBuilder.buildUnmerge(Regs, VRegs[i]);
}))
return false;
@@ -231,7 +229,9 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
: ValueHandler(MIRBuilder, MRI, AssignFn),
DL(MIRBuilder.getMF().getDataLayout()) {}
- unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ bool isArgumentHandler() const override { return true; }
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
int FI = MFI.CreateFixedObject(Size, Offset, true);
@@ -243,15 +243,15 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
return AddrReg;
}
- void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
- 0);
+ 1);
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
- void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
@@ -320,9 +320,9 @@ protected:
} // end anonymous namespace
-bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
- const Function &F,
- ArrayRef<unsigned> VRegs) const {
+bool X86CallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const {
if (F.arg_empty())
return true;
@@ -344,14 +344,14 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
Arg.hasAttribute(Attribute::StructRet) ||
Arg.hasAttribute(Attribute::SwiftSelf) ||
Arg.hasAttribute(Attribute::SwiftError) ||
- Arg.hasAttribute(Attribute::Nest))
+ Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1)
return false;
ArgInfo OrigArg(VRegs[Idx], Arg.getType());
setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs) {
- MIRBuilder.buildMerge(VRegs[Idx], Regs);
+ [&](ArrayRef<Register> Regs) {
+ MIRBuilder.buildMerge(VRegs[Idx][0], Regs);
}))
return false;
Idx++;
@@ -409,9 +409,12 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (OrigArg.Flags.isByVal())
return false;
+ if (OrigArg.Regs.size() > 1)
+ return false;
+
if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs) {
- MIRBuilder.buildUnmerge(Regs, OrigArg.Reg);
+ [&](ArrayRef<Register> Regs) {
+ MIRBuilder.buildUnmerge(Regs, OrigArg.Regs[0]);
}))
return false;
}
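
The shape of the new GlobalISel interface: each IR argument now maps to a (possibly multi-element) list of virtual registers, and this target falls back whenever more than one register is involved. A hedged sketch of that contract (a fragment; LLVM headers and the surrounding lowering code are assumed, and lowerArgsSketch is an illustrative name):

    // One inner ArrayRef per IR argument; splitting an argument across
    // several vregs is legal for the caller, but X86 only handles the
    // single-vreg case and returns false to request the SelectionDAG
    // fallback otherwise.
    bool lowerArgsSketch(llvm::ArrayRef<llvm::ArrayRef<llvm::Register>> VRegs) {
      for (llvm::ArrayRef<llvm::Register> ArgRegs : VRegs)
        if (ArgRegs.size() != 1)
          return false; // multi-register arguments not supported yet
      // ... per-argument lowering as in lowerFormalArguments above ...
      return true;
    }
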
@@ -451,12 +454,15 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
- if (OrigRet.Reg) {
+ if (!OrigRet.Ty->isVoidTy()) {
+ if (OrigRet.Regs.size() > 1)
+ return false;
+
SplitArgs.clear();
- SmallVector<unsigned, 8> NewRegs;
+ SmallVector<Register, 8> NewRegs;
if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs) {
+ [&](ArrayRef<Register> Regs) {
NewRegs.assign(Regs.begin(), Regs.end());
}))
return false;
@@ -466,7 +472,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
if (!NewRegs.empty())
- MIRBuilder.buildMerge(OrigRet.Reg, NewRegs);
+ MIRBuilder.buildMerge(OrigRet.Regs[0], NewRegs);
}
CallSeqStart.addImm(Handler.getStackSize())
diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h
index f5f8f9a3ef6d..0445331bc3ff 100644
--- a/lib/Target/X86/X86CallLowering.h
+++ b/lib/Target/X86/X86CallLowering.h
@@ -1,9 +1,8 @@
//===- llvm/lib/Target/X86/X86CallLowering.h - Call lowering ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -30,10 +29,10 @@ public:
X86CallLowering(const X86TargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<Register> VRegs) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
const MachineOperand &Callee, const ArgInfo &OrigRet,
@@ -41,7 +40,7 @@ public:
private:
/// A function of this type is used to perform value split action.
- using SplitArgTy = std::function<void(ArrayRef<unsigned>)>;
+ using SplitArgTy = std::function<void(ArrayRef<Register>)>;
bool splitToValueTypes(const ArgInfo &OrigArgInfo,
SmallVectorImpl<ArgInfo> &SplitArgs,
diff --git a/lib/Target/X86/X86CallingConv.cpp b/lib/Target/X86/X86CallingConv.cpp
index 59dde982f512..aee344a26764 100644
--- a/lib/Target/X86/X86CallingConv.cpp
+++ b/lib/Target/X86/X86CallingConv.cpp
@@ -1,9 +1,8 @@
//=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,16 +11,23 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86CallingConv.h"
#include "X86Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/IR/CallingConv.h"
-namespace llvm {
-
-bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+using namespace llvm;
+
+/// When the regcall calling convention is compiled for a 32 bit arch, special
+/// treatment is required for 64 bit masks.
+/// The value should be assigned to two GPRs.
+/// \return true if registers were allocated and false otherwise.
+static bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
// List of GPR registers that are available to store values in regcall
// calling convention.
static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI,
@@ -113,9 +119,15 @@ static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
return false;
}
-bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+/// The vectorcall calling convention has special handling for vector types or
+/// HVAs for a 64 bit arch.
+/// For HVAs, shadow registers might be allocated on the first pass
+/// and actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+static bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
// On the second pass, go through the HVAs only.
if (ArgFlags.isSecArgPass()) {
if (ArgFlags.isHva())
@@ -150,7 +162,10 @@ bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
// created on top of the basic 32 bytes of win64.
// It can happen if the fifth or sixth argument is a vector type or HVA.
// In that case, a shadow stack of 8 bytes is allocated for each argument.
- if (Reg == X86::XMM4 || Reg == X86::XMM5)
+ const TargetRegisterInfo *TRI =
+ State.getMachineFunction().getSubtarget().getRegisterInfo();
+ if (TRI->regsOverlap(Reg, X86::XMM4) ||
+ TRI->regsOverlap(Reg, X86::XMM5))
State.AllocateStack(8, 8);
if (!ArgFlags.isHva()) {
@@ -165,9 +180,14 @@ bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return ArgFlags.isHva();
}
-bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+/// The vectorcall calling convention has special handling for vector types or
+/// HVAs for a 32 bit arch.
+/// For HVAs, actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+static bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
// On the second pass, go through the HVAs only.
if (ArgFlags.isSecArgPass()) {
if (ArgFlags.isHva())
@@ -205,4 +225,110 @@ bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return false; // No register was assigned - Continue the search.
}
-} // End llvm namespace
+static bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
+ CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+ CCState &) {
+ llvm_unreachable("The AnyReg calling convention is only supported by the "
+ "stackmap and patchpoint intrinsics.");
+ // Gracefully fall back to the X86 C calling convention on Release builds.
+ return false;
+}
+
+static bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure
+ // not to split i64 and double between a register and stack
+ static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
+ static const unsigned NumRegs = sizeof(RegList) / sizeof(RegList[0]);
+
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // If this is the first part of a double/i64/i128, or if we're already
+ // in the middle of a split, add to the pending list. If this is not
+ // the end of the split, return; otherwise go on to process the pending
+ // list.
+ if (ArgFlags.isSplit() || !PendingMembers.empty()) {
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+ if (!ArgFlags.isSplitEnd())
+ return true;
+ }
+
+ // If there are no pending members, we are not in the middle of a split,
+ // so do the usual inreg stuff.
+ if (PendingMembers.empty()) {
+ if (unsigned Reg = State.AllocateReg(RegList)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false;
+ }
+
+ assert(ArgFlags.isSplitEnd());
+
+ // We now have the entire original argument in PendingMembers, so decide
+ // whether to use registers or the stack.
+ // Per the MCU ABI:
+ // a) To use registers, we need to have enough of them free to contain
+ // the entire argument.
+ // b) We never want to use more than 2 registers for a single argument.
+
+ unsigned FirstFree = State.getFirstUnallocated(RegList);
+ bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);
+
+ for (auto &It : PendingMembers) {
+ if (UseRegs)
+ It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
+ else
+ It.convertToMem(State.AllocateStack(4, 4));
+ State.addLoc(It);
+ }
+
+ PendingMembers.clear();
+
+ return true;
+}
+
+/// X86 interrupt handlers can only take one or two stack arguments, but if
+/// there are two arguments, they are in the opposite order from the standard
+/// convention. Therefore, we have to look at the argument count up front before
+/// allocating stack for each argument.
+static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ const MachineFunction &MF = State.getMachineFunction();
+ size_t ArgCount = State.getMachineFunction().getFunction().arg_size();
+ bool Is64Bit = static_cast<const X86Subtarget &>(MF.getSubtarget()).is64Bit();
+ unsigned SlotSize = Is64Bit ? 8 : 4;
+ unsigned Offset;
+ if (ArgCount == 1 && ValNo == 0) {
+ // If we have one argument, the argument is five stack slots big, at fixed
+ // offset zero.
+ Offset = State.AllocateStack(5 * SlotSize, 4);
+ } else if (ArgCount == 2 && ValNo == 0) {
+ // If we have two arguments, the stack slot is *after* the error code
+ // argument. Pretend it doesn't consume stack space, and account for it when
+ // we assign the second argument.
+ Offset = SlotSize;
+ } else if (ArgCount == 2 && ValNo == 1) {
+ // If this is the second of two arguments, it must be the error code. It
+ // appears first on the stack, and is then followed by the five slot
+ // interrupt struct.
+ Offset = 0;
+ (void)State.AllocateStack(6 * SlotSize, 4);
+ } else {
+ report_fatal_error("unsupported x86 interrupt prototype");
+ }
+
+ // FIXME: This should be accounted for in
+ // X86FrameLowering::getFrameIndexReference, not here.
+ if (Is64Bit && ArgCount == 2)
+ Offset += SlotSize;
+
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return true;
+}
+
+// Provides entry points of CC_X86 and RetCC_X86.
+#include "X86GenCallingConv.inc"
diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
index d0fcbd313312..191e0fa619b2 100644
--- a/lib/Target/X86/X86CallingConv.h
+++ b/lib/Target/X86/X86CallingConv.h
@@ -1,9 +1,8 @@
//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -21,99 +20,12 @@
namespace llvm {
-/// When regcall calling convention compiled to 32 bit arch, special treatment
-/// is required for 64 bit masks.
-/// The value should be assigned to two GPRs.
-/// \return true if registers were allocated and false otherwise.
-bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State);
-
-/// Vectorcall calling convention has special handling for vector types or
-/// HVA for 64 bit arch.
-/// For HVAs shadow registers might be allocated on the first pass
-/// and actual XMM registers are allocated on the second pass.
-/// For vector types, actual XMM registers are allocated on the first pass.
-/// \return true if registers were allocated and false otherwise.
-bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State);
-
-/// Vectorcall calling convention has special handling for vector types or
-/// HVA for 32 bit arch.
-/// For HVAs actual XMM registers are allocated on the second pass.
-/// For vector types, actual XMM registers are allocated on the first pass.
-/// \return true if registers were allocated and false otherwise.
-bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State);
-
-inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
- CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
- CCState &) {
- llvm_unreachable("The AnyReg calling convention is only supported by the " \
- "stackmap and patchpoint intrinsics.");
- // gracefully fallback to X86 C calling convention on Release builds.
- return false;
-}
-
-inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure
- // not to split i64 and double between a register and stack
- static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
- static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]);
-
- SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
-
- // If this is the first part of an double/i64/i128, or if we're already
- // in the middle of a split, add to the pending list. If this is not
- // the end of the split, return, otherwise go on to process the pending
- // list
- if (ArgFlags.isSplit() || !PendingMembers.empty()) {
- PendingMembers.push_back(
- CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
- if (!ArgFlags.isSplitEnd())
- return true;
- }
-
- // If there are no pending members, we are not in the middle of a split,
- // so do the usual inreg stuff.
- if (PendingMembers.empty()) {
- if (unsigned Reg = State.AllocateReg(RegList)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return true;
- }
- return false;
- }
-
- assert(ArgFlags.isSplitEnd());
-
- // We now have the entire original argument in PendingMembers, so decide
- // whether to use registers or the stack.
- // Per the MCU ABI:
- // a) To use registers, we need to have enough of them free to contain
- // the entire argument.
- // b) We never want to use more than 2 registers for a single argument.
-
- unsigned FirstFree = State.getFirstUnallocated(RegList);
- bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);
-
- for (auto &It : PendingMembers) {
- if (UseRegs)
- It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
- else
- It.convertToMem(State.AllocateStack(4, 4));
- State.addLoc(It);
- }
-
- PendingMembers.clear();
+bool RetCC_X86(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
- return true;
-}
+bool CC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
} // End llvm namespace
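
With the custom routines now static and only the TableGen-generated CC_X86 / RetCC_X86 entry points exported, callers consume them through CCState as before. A small usage sketch (a fragment; CallConv, IsVarArg, MF, F and Ins are assumed surrounding context, not copied from this patch):

    // Analyze formal arguments against the single exported assign function.
    llvm::SmallVector<llvm::CCValAssign, 16> ArgLocs;
    llvm::CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, F.getContext());
    CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
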
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index fe49c9ffbd95..1c3034a5116a 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -1,9 +1,8 @@
//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -148,7 +147,8 @@ def CC_#NAME : CallingConv<[
CCAssignToStack<32, 32>>,
// 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
- CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>>
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
]>;
def RetCC_#NAME : CallingConv<[
@@ -477,6 +477,7 @@ def RetCC_X86_64 : CallingConv<[
]>;
// This is the return-value convention used for the entire X86 backend.
+let Entry = 1 in
def RetCC_X86 : CallingConv<[
// Check if this is the Intel OpenCL built-ins calling convention
@@ -567,7 +568,7 @@ def CC_X86_64_C : CallingConv<[
CCAssignToStack<32, 32>>,
// 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
- CCIfType<[v16i32, v8i64, v16f32, v8f64],
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
CCAssignToStack<64, 64>>
]>;
@@ -612,7 +613,7 @@ def CC_X86_Win64_C : CallingConv<[
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
// 512 bit vectors are passed by pointer
- CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+ CCIfType<[v64i8, v32i16, v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
// Long doubles are passed by pointer
CCIfType<[f80], CCPassIndirect<i64>>,
@@ -985,14 +986,6 @@ def CC_Intel_OCL_BI : CallingConv<[
CCDelegateTo<CC_X86_32_C>
]>;
-def CC_X86_32_Intr : CallingConv<[
- CCAssignToStack<4, 4>
-]>;
-
-def CC_X86_64_Intr : CallingConv<[
- CCAssignToStack<8, 8>
-]>;
-
//===----------------------------------------------------------------------===//
// X86 Root Argument Calling Conventions
//===----------------------------------------------------------------------===//
@@ -1001,7 +994,7 @@ def CC_X86_64_Intr : CallingConv<[
def CC_X86_32 : CallingConv<[
// X86_INTR calling convention is valid in MCU target and should override the
// MCU calling convention. Thus, this should be checked before isTargetMCU().
- CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
+ CCIfCC<"CallingConv::X86_INTR", CCCustom<"CC_X86_Intr">>,
CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
@@ -1029,7 +1022,7 @@ def CC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::X86_RegCall",
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>,
CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>,
- CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>,
+ CCIfCC<"CallingConv::X86_INTR", CCCustom<"CC_X86_Intr">>,
// Mingw64 and native Win64 use Win64 CC
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
@@ -1039,6 +1032,7 @@ def CC_X86_64 : CallingConv<[
]>;
// This is the argument convention used for the entire X86 backend.
+let Entry = 1 in
def CC_X86 : CallingConv<[
CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index c3e76fd2a856..a61fa3246f09 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -1,9 +1,8 @@
//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -102,9 +101,7 @@ namespace {
/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
- X86CmovConverterPass() : MachineFunctionPass(ID) {
- initializeX86CmovConverterPassPass(*PassRegistry::getPassRegistry());
- }
+ X86CmovConverterPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override { return "X86 cmov Conversion"; }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -281,7 +278,8 @@ bool X86CmovConverterPass::collectCmovCandidates(
Group.clear();
// Condition code of the first CMOV instruction in the currently processed
// range, and its opposite condition code.
- X86::CondCode FirstCC, FirstOppCC, MemOpCC;
+ X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
+ MemOpCC = X86::COND_INVALID;
// Indicator of a non CMOVrr instruction in the current processed range.
bool FoundNonCMOVInst = false;
// Indicator for current processed CMOV-group if it should be skipped.
@@ -291,7 +289,7 @@ bool X86CmovConverterPass::collectCmovCandidates(
// Skip debug instructions.
if (I.isDebugInstr())
continue;
- X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
+ X86::CondCode CC = X86::getCondFromCMov(I);
// Check if we found a X86::CMOVrr instruction.
if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) {
if (Group.empty()) {
@@ -546,7 +544,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
}
unsigned CondCost =
- DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth;
+ DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
unsigned ValCost = getDepthOfOptCmov(
DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
@@ -594,7 +592,7 @@ static bool checkEFLAGSLive(MachineInstr *MI) {
/// move all debug instructions to after the last CMOV instruction, making the
/// CMOV group consecutive.
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
- assert(X86::getCondFromCMovOpc(Last->getOpcode()) != X86::COND_INVALID &&
+ assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
"Last instruction in a CMOV group must be a CMOV instruction");
SmallVector<MachineInstr *, 2> DBGInstructions;
@@ -652,14 +650,14 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
MachineInstr *LastCMOV = Group.back();
DebugLoc DL = MI.getDebugLoc();
- X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
+ X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
// Potentially swap the condition codes so that any memory operand to a CMOV
// is in the *false* position instead of the *true* position. We can invert
// any non-memory operand CMOV instructions to cope with this and we ensure
// memory operand CMOVs are only included with a single condition code.
if (llvm::any_of(Group, [&](MachineInstr *I) {
- return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC;
+ return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
}))
std::swap(CC, OppCC);
@@ -690,7 +688,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
MBB->addSuccessor(SinkMBB);
// Create the conditional branch instruction.
- BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB);
+ BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
// Add the sink block to the false block successors.
FalseMBB->addSuccessor(SinkMBB);
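
The branch-emission change visible here (and repeated throughout the patch) replaces per-condition opcodes such as JE_1/JB_1 with the single JCC_1 opcode that carries the condition code as an extra immediate operand. The idiom, as used in this pass (X86::COND_E is just an example value):

    // Condition code is now an operand rather than being baked into the
    // opcode.
    BuildMI(MBB, DL, TII->get(X86::JCC_1))
        .addMBB(SinkMBB)
        .addImm(X86::COND_E);
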
@@ -713,8 +711,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
if (!MI.mayLoad()) {
// Remember the false-side register input.
unsigned FalseReg =
- MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2)
- .getReg();
+ MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
// Walk back through any intermediate cmovs referenced.
while (true) {
auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
@@ -729,7 +726,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// The condition must be the *opposite* of the one we've decided to branch
// on as the branch will go *around* the load and the load should happen
// when the CMOV condition is false.
- assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC &&
+ assert(X86::getCondFromCMov(MI) == OppCC &&
"Can only handle memory-operand cmov instructions with a condition "
"opposite to the selected branch direction.");
@@ -768,7 +765,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// Move the new CMOV to just before the old one and reset any impacted
// iterator.
auto *NewCMOV = NewMIs.pop_back_val();
- assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC &&
+ assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
"Last new instruction isn't the expected CMOV!");
LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
@@ -820,7 +817,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// If this CMOV we are processing is the opposite condition from the jump we
// generated, then we have to swap the operands for the PHI that is going to
// be generated.
- if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC)
+ if (X86::getCondFromCMov(*MIIt) == OppCC)
std::swap(Op1Reg, Op2Reg);
auto Op1Itr = RegRewriteTable.find(Op1Reg);
diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp
index 7ce443c4656a..9dea94f1368d 100644
--- a/lib/Target/X86/X86CondBrFolding.cpp
+++ b/lib/Target/X86/X86CondBrFolding.cpp
@@ -1,9 +1,8 @@
//===---- X86CondBrFolding.cpp - optimize conditional branches ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file defines a pass that optimizes conditional branches on x86 by taking
@@ -62,9 +61,7 @@ STATISTIC(NumFixedCondBrs, "Number of x86 condbr folded");
namespace {
class X86CondBrFoldingPass : public MachineFunctionPass {
public:
- X86CondBrFoldingPass() : MachineFunctionPass(ID) {
- initializeX86CondBrFoldingPassPass(*PassRegistry::getPassRegistry());
- }
+ X86CondBrFoldingPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override { return "X86 CondBr Folding"; }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -226,10 +223,9 @@ void X86CondBrFolding::replaceBrDest(MachineBasicBlock *MBB,
MachineInstr *BrMI;
if (MBBInfo->TBB == OrigDest) {
BrMI = MBBInfo->BrInstr;
- unsigned JNCC = GetCondBranchFromCond(MBBInfo->BranchCode);
MachineInstrBuilder MIB =
- BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(JNCC))
- .addMBB(NewDest);
+ BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(X86::JCC_1))
+ .addMBB(NewDest).addImm(MBBInfo->BranchCode);
MBBInfo->TBB = NewDest;
MBBInfo->BrInstr = MIB.getInstr();
} else { // Should be the unconditional jump stmt.
@@ -255,8 +251,8 @@ void X86CondBrFolding::fixupModifiedCond(MachineBasicBlock *MBB) {
MachineInstr *BrMI = MBBInfo->BrInstr;
X86::CondCode CC = MBBInfo->BranchCode;
MachineInstrBuilder MIB = BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI),
- TII->get(GetCondBranchFromCond(CC)))
- .addMBB(MBBInfo->TBB);
+ TII->get(X86::JCC_1))
+ .addMBB(MBBInfo->TBB).addImm(CC);
BrMI->eraseFromParent();
MBBInfo->BrInstr = MIB.getInstr();
@@ -324,8 +320,8 @@ void X86CondBrFolding::optimizeCondBr(
llvm_unreachable("unexpected condtional code.");
}
BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI),
- TII->get(GetCondBranchFromCond(NewCC)))
- .addMBB(RootMBBInfo->FBB);
+ TII->get(X86::JCC_1))
+ .addMBB(RootMBBInfo->FBB).addImm(NewCC);
// RootMBB: Jump to TargetMBB
BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI),
@@ -513,7 +509,7 @@ X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) {
if (I->isBranch()) {
if (TBB)
return nullptr;
- CC = X86::getCondFromBranchOpc(I->getOpcode());
+ CC = X86::getCondFromBranch(*I);
switch (CC) {
default:
return nullptr;
diff --git a/lib/Target/X86/X86DiscriminateMemOps.cpp b/lib/Target/X86/X86DiscriminateMemOps.cpp
index 3654bf04f4e9..7051550d52e6 100644
--- a/lib/Target/X86/X86DiscriminateMemOps.cpp
+++ b/lib/Target/X86/X86DiscriminateMemOps.cpp
@@ -1,9 +1,8 @@
//===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -27,6 +26,22 @@ using namespace llvm;
#define DEBUG_TYPE "x86-discriminate-memops"
+static cl::opt<bool> EnableDiscriminateMemops(
+ DEBUG_TYPE, cl::init(false),
+ cl::desc("Generate unique debug info for each instruction with a memory "
+ "operand. Should be enabled for profile-drived cache prefetching, "
+ "both in the build of the binary being profiled, as well as in "
+ "the build of the binary consuming the profile."),
+ cl::Hidden);
+
+static cl::opt<bool> BypassPrefetchInstructions(
+ "x86-bypass-prefetch-instructions", cl::init(true),
+ cl::desc("When discriminating instructions with memory operands, ignore "
+ "prefetch instructions. This ensures the other memory operand "
+ "instructions have the same identifiers after inserting "
+ "prefetches, allowing for successive insertions."),
+ cl::Hidden);
+
namespace {
using Location = std::pair<StringRef, unsigned>;
@@ -55,6 +70,10 @@ public:
X86DiscriminateMemOps();
};
+bool IsPrefetchOpcode(unsigned Opcode) {
+ return Opcode == X86::PREFETCHNTA || Opcode == X86::PREFETCHT0 ||
+ Opcode == X86::PREFETCHT1 || Opcode == X86::PREFETCHT2;
+}
} // end anonymous namespace
//===----------------------------------------------------------------------===//
@@ -67,6 +86,9 @@ char X86DiscriminateMemOps::ID = 0;
X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {}
bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
+ if (!EnableDiscriminateMemops)
+ return false;
+
DISubprogram *FDI = MF.getFunction().getSubprogram();
if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling())
return false;
@@ -75,7 +97,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
// have any debug info.
const DILocation *ReferenceDI =
DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI);
-
+ assert(ReferenceDI && "ReferenceDI should not be nullptr");
DenseMap<Location, unsigned> MemOpDiscriminators;
MemOpDiscriminators[diToLocation(ReferenceDI)] = 0;
@@ -88,6 +110,8 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
const auto &DI = MI.getDebugLoc();
if (!DI)
continue;
+ if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode))
+ continue;
Location Loc = diToLocation(DI);
MemOpDiscriminators[Loc] =
std::max(MemOpDiscriminators[Loc], DI->getBaseDiscriminator());
@@ -104,15 +128,18 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
for (auto &MI : MBB) {
if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0)
continue;
+ if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode))
+ continue;
const DILocation *DI = MI.getDebugLoc();
- if (!DI) {
+ bool HasDebug = DI;
+ if (!HasDebug) {
DI = ReferenceDI;
}
Location L = diToLocation(DI);
DenseSet<unsigned> &Set = Seen[L];
const std::pair<DenseSet<unsigned>::iterator, bool> TryInsert =
Set.insert(DI->getBaseDiscriminator());
- if (!TryInsert.second) {
+ if (!TryInsert.second || !HasDebug) {
unsigned BF, DF, CI = 0;
DILocation::decodeDiscriminator(DI->getDiscriminator(), BF, DF, CI);
Optional<unsigned> EncodedDiscriminator = DILocation::encodeDiscriminator(
@@ -133,6 +160,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
// Since we were able to encode, bump the MemOpDiscriminators.
++MemOpDiscriminators[L];
DI = DI->cloneWithDiscriminator(EncodedDiscriminator.getValue());
+ assert(DI && "DI should not be nullptr");
updateDebugInfo(&MI, DI);
Changed = true;
std::pair<DenseSet<unsigned>::iterator, bool> MustInsert =
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index d9ebbb506ca4..18bbfa32e11b 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -1,9 +1,8 @@
//===--- X86DomainReassignment.cpp - Selectively switch register classes---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -387,9 +386,7 @@ class X86DomainReassignment : public MachineFunctionPass {
public:
static char ID;
- X86DomainReassignment() : MachineFunctionPass(ID) {
- initializeX86DomainReassignmentPass(*PassRegistry::getPassRegistry());
- }
+ X86DomainReassignment() : MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -557,6 +554,7 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
// Register already in this closure.
if (!C.insertEdge(CurReg))
continue;
+ EnclosedEdges.insert(Reg);
MachineInstr *DefMI = MRI->getVRegDef(CurReg);
encloseInstr(C, DefMI);
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index 80674c7251fe..58680f1815bb 100755
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -1,10 +1,9 @@
//===- X86EvexToVex.cpp ---------------------------------------------------===//
// Compress EVEX instructions to VEX encoding when possible to reduce code size
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,15 +12,15 @@
/// are encoded using the EVEX prefix and if possible replaces them by their
/// corresponding VEX encoding which is usually shorter by 2 bytes.
/// EVEX instructions may be encoded via the VEX prefix when the AVX-512
-/// instruction has a corresponding AVX/AVX2 opcode and when it does not
-/// use the xmm or the mask registers or xmm/ymm registers with indexes
-/// higher than 15.
+/// instruction has a corresponding AVX/AVX2 opcode, when the vector length
+/// accessed by the instruction is less than 512 bits, and when it does not use
+/// the xmm or the mask registers or xmm/ymm registers with indexes higher than 15.
/// The pass applies code reduction on the generated code for AVX-512 instrs.
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/X86InstComments.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86InstComments.h"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
@@ -69,9 +68,7 @@ class EvexToVexInstPass : public MachineFunctionPass {
public:
static char ID;
- EvexToVexInstPass() : MachineFunctionPass(ID) {
- initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry());
- }
+ EvexToVexInstPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override { return EVEX2VEX_DESC; }
@@ -255,7 +252,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
(Desc.TSFlags & X86II::VEX_L) ? makeArrayRef(X86EvexToVex256CompressTable)
: makeArrayRef(X86EvexToVex128CompressTable);
- auto I = std::lower_bound(Table.begin(), Table.end(), MI.getOpcode());
+ auto I = llvm::lower_bound(Table, MI.getOpcode());
if (I == Table.end() || I->EvexOpcode != MI.getOpcode())
return false;
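
The lower_bound change above is a drop-in range helper. For clarity, the two forms are equivalent (Table and MI are the compression table and instruction from this pass):

    // llvm::lower_bound (llvm/ADT/STLExtras.h) is the range-based wrapper:
    auto I = llvm::lower_bound(Table, MI.getOpcode());
    // ...which expands to the explicit iterator form it replaces:
    auto J = std::lower_bound(Table.begin(), Table.end(), MI.getOpcode());
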
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index 1dd73163080b..b8624b40f2f7 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -1,9 +1,8 @@
//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,6 +26,7 @@
using namespace llvm;
#define DEBUG_TYPE "x86-pseudo"
+#define X86_EXPAND_PSEUDO_NAME "X86 pseudo instruction expansion pass"
namespace {
class X86ExpandPseudo : public MachineFunctionPass {
@@ -66,8 +66,12 @@ private:
bool ExpandMBB(MachineBasicBlock &MBB);
};
char X86ExpandPseudo::ID = 0;
+
} // End anonymous namespace.
+INITIALIZE_PASS(X86ExpandPseudo, DEBUG_TYPE, X86_EXPAND_PSEUDO_NAME, false,
+ false)
+
void X86ExpandPseudo::ExpandICallBranchFunnel(
MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) {
MachineBasicBlock *JTMBB = MBB;
@@ -83,6 +87,8 @@ void X86ExpandPseudo::ExpandICallBranchFunnel(
const GlobalValue *CombinedGlobal = JTInst->getOperand(1).getGlobal();
auto CmpTarget = [&](unsigned Target) {
+ if (Selector.isReg())
+ MBB->addLiveIn(Selector.getReg());
BuildMI(*MBB, MBBI, DL, TII->get(X86::LEA64r), X86::R11)
.addReg(X86::RIP)
.addImm(1)
@@ -98,11 +104,13 @@ void X86ExpandPseudo::ExpandICallBranchFunnel(
auto CreateMBB = [&]() {
auto *NewMBB = MF->CreateMachineBasicBlock(BB);
MBB->addSuccessor(NewMBB);
+ if (!MBB->isLiveIn(X86::EFLAGS))
+ MBB->addLiveIn(X86::EFLAGS);
return NewMBB;
};
- auto EmitCondJump = [&](unsigned Opcode, MachineBasicBlock *ThenMBB) {
- BuildMI(*MBB, MBBI, DL, TII->get(Opcode)).addMBB(ThenMBB);
+ auto EmitCondJump = [&](unsigned CC, MachineBasicBlock *ThenMBB) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::JCC_1)).addMBB(ThenMBB).addImm(CC);
auto *ElseMBB = CreateMBB();
MF->insert(InsPt, ElseMBB);
@@ -110,10 +118,10 @@ void X86ExpandPseudo::ExpandICallBranchFunnel(
MBBI = MBB->end();
};
- auto EmitCondJumpTarget = [&](unsigned Opcode, unsigned Target) {
+ auto EmitCondJumpTarget = [&](unsigned CC, unsigned Target) {
auto *ThenMBB = CreateMBB();
TargetMBBs.push_back({ThenMBB, Target});
- EmitCondJump(Opcode, ThenMBB);
+ EmitCondJump(CC, ThenMBB);
};
auto EmitTailCall = [&](unsigned Target) {
@@ -130,23 +138,23 @@ void X86ExpandPseudo::ExpandICallBranchFunnel(
if (NumTargets == 2) {
CmpTarget(FirstTarget + 1);
- EmitCondJumpTarget(X86::JB_1, FirstTarget);
+ EmitCondJumpTarget(X86::COND_B, FirstTarget);
EmitTailCall(FirstTarget + 1);
return;
}
if (NumTargets < 6) {
CmpTarget(FirstTarget + 1);
- EmitCondJumpTarget(X86::JB_1, FirstTarget);
- EmitCondJumpTarget(X86::JE_1, FirstTarget + 1);
+ EmitCondJumpTarget(X86::COND_B, FirstTarget);
+ EmitCondJumpTarget(X86::COND_E, FirstTarget + 1);
EmitBranchFunnel(FirstTarget + 2, NumTargets - 2);
return;
}
auto *ThenMBB = CreateMBB();
CmpTarget(FirstTarget + (NumTargets / 2));
- EmitCondJump(X86::JB_1, ThenMBB);
- EmitCondJumpTarget(X86::JE_1, FirstTarget + (NumTargets / 2));
+ EmitCondJump(X86::COND_B, ThenMBB);
+ EmitCondJumpTarget(X86::COND_E, FirstTarget + (NumTargets / 2));
EmitBranchFunnel(FirstTarget + (NumTargets / 2) + 1,
NumTargets - (NumTargets / 2) - 1);
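
The rewrites above are one instance of a change that recurs throughout this import: the per-condition jump opcodes (JB_1, JE_1, ...) are replaced by a single JCC_1 opcode whose condition code is carried as an immediate operand, and the SETcc and CMOV families receive the same treatment (X86::SETCCr, getCMovOpcode) in later hunks. A minimal standalone sketch of the before/after shape, with illustrative enums standing in for the real X86 opcode and condition-code values:

#include <cstdint>
#include <utility>

// Illustrative stand-ins; not the real X86 enum values.
enum CondCode : uint8_t { COND_B, COND_E, COND_NE, COND_P };
enum Opcode : uint16_t { JB_1, JE_1, JNE_1, JP_1, JCC_1 };

// Before: the condition picks the opcode.
Opcode legacyCondJumpOpcode(CondCode CC) {
  switch (CC) {
  case COND_B:  return JB_1;
  case COND_E:  return JE_1;
  case COND_NE: return JNE_1;
  case COND_P:  return JP_1;
  }
  return JP_1; // unreachable for the cases above
}

// After: one opcode, and the condition travels as an operand, mirroring
// BuildMI(..., TII->get(X86::JCC_1)).addMBB(Target).addImm(CC) above.
std::pair<Opcode, CondCode> condJump(CondCode CC) { return {JCC_1, CC}; }
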
@@ -254,16 +262,19 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
for (unsigned i = 0; i != 5; ++i)
MIB.add(MBBI->getOperand(i));
} else if (Opcode == X86::TCRETURNri64) {
+ JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL,
TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
- .addReg(JumpTarget.getReg(), RegState::Kill);
+ .add(JumpTarget);
} else {
+ JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
- .addReg(JumpTarget.getReg(), RegState::Kill);
+ .add(JumpTarget);
}
MachineInstr &NewMI = *std::prev(MBBI);
NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
+ MBB.getParent()->updateCallSiteInfo(&*MBBI, &NewMI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 9dd3f2652543..7b9ce0271205 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1,9 +1,8 @@
//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -85,7 +84,7 @@ private:
bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
const DebugLoc &DL);
- bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
+ bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
unsigned &ResultReg, unsigned Alignment = 1);
bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
@@ -290,7 +289,7 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
}
bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
- EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
+ EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
if (evt == MVT::Other || !evt.isSimple())
// Unhandled type. Halt "fast" selection and bail.
return false;
@@ -312,12 +311,10 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
}
-#include "X86GenCallingConv.inc"
-
/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
/// Return true and the result register by reference if it is possible.
-bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
+bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
MachineMemOperand *MMO, unsigned &ResultReg,
unsigned Alignment) {
bool HasSSE41 = Subtarget->hasSSE41();
@@ -327,46 +324,42 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
bool HasVLX = Subtarget->hasVLX();
bool IsNonTemporal = MMO && MMO->isNonTemporal();
+ // Treat i1 loads the same as i8 loads. Masking will be done when storing.
+ if (VT == MVT::i1)
+ VT = MVT::i8;
+
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return false;
- case MVT::i1:
case MVT::i8:
Opc = X86::MOV8rm;
- RC = &X86::GR8RegClass;
break;
case MVT::i16:
Opc = X86::MOV16rm;
- RC = &X86::GR16RegClass;
break;
case MVT::i32:
Opc = X86::MOV32rm;
- RC = &X86::GR32RegClass;
break;
case MVT::i64:
// Must be in x86-64 mode.
Opc = X86::MOV64rm;
- RC = &X86::GR64RegClass;
break;
case MVT::f32:
- if (X86ScalarSSEf32) {
- Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass;
- } else {
+ if (X86ScalarSSEf32)
+ Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt;
+ else
Opc = X86::LD_Fp32m;
- RC = &X86::RFP32RegClass;
- }
break;
case MVT::f64:
- if (X86ScalarSSEf64) {
- Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass;
- } else {
+ if (X86ScalarSSEf64)
+ Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt;
+ else
Opc = X86::LD_Fp64m;
- RC = &X86::RFP64RegClass;
- }
break;
case MVT::f80:
// No f80 support yet.
@@ -381,7 +374,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
else
Opc = HasVLX ? X86::VMOVUPSZ128rm :
HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
- RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
break;
case MVT::v2f64:
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
@@ -393,13 +385,12 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
else
Opc = HasVLX ? X86::VMOVUPDZ128rm :
HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
- RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
- if (IsNonTemporal && Alignment >= 16)
+ if (IsNonTemporal && Alignment >= 16 && HasSSE41)
Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
@@ -408,7 +399,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
else
Opc = HasVLX ? X86::VMOVDQU64Z128rm :
HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
- RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
break;
case MVT::v8f32:
assert(HasAVX);
@@ -420,7 +410,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
else
Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
- RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
break;
case MVT::v4f64:
assert(HasAVX);
@@ -432,7 +421,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
else
Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
- RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
break;
case MVT::v8i32:
case MVT::v4i64:
@@ -447,7 +435,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
else
Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
- RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
break;
case MVT::v16f32:
assert(HasAVX512);
@@ -455,7 +442,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = X86::VMOVNTDQAZrm;
else
Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
- RC = &X86::VR512RegClass;
break;
case MVT::v8f64:
assert(HasAVX512);
@@ -463,7 +449,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = X86::VMOVNTDQAZrm;
else
Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
- RC = &X86::VR512RegClass;
break;
case MVT::v8i64:
case MVT::v16i32:
@@ -476,10 +461,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = X86::VMOVNTDQAZrm;
else
Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
- RC = &X86::VR512RegClass;
break;
}
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+
ResultReg = createResultReg(RC);
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
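
The net effect of the hunks above is that the opcode switch no longer tracks a register class per case; the class is obtained once from the value type via TLI.getRegClassFor(VT). A minimal standalone sketch of that split between opcode selection and class selection, with illustrative enums in place of MVT and the real register classes:

// Illustrative stand-ins for MVT::SimpleValueType and the X86 register classes.
enum class VT { i8, i16, i32, f32 };
enum class RegClass { GR8, GR16, GR32, FR32 };

// Single source of truth for which class holds a value of a given type
// (the role TLI.getRegClassFor plays in the real code).
RegClass regClassFor(VT T) {
  switch (T) {
  case VT::i8:  return RegClass::GR8;
  case VT::i16: return RegClass::GR16;
  case VT::i32: return RegClass::GR32;
  case VT::f32: return RegClass::FR32;
  }
  return RegClass::GR32;
}

// The load helper now only chooses an opcode per type and asks
// regClassFor(T) for the class instead of duplicating it in every case.
unsigned loadOpcodeFor(VT T) {
  switch (T) {
  case VT::i8:  return 1; // e.g. MOV8rm
  case VT::i16: return 2; // e.g. MOV16rm
  case VT::i32: return 3; // e.g. MOV32rm
  case VT::f32: return 4; // e.g. MOVSSrm_alt
  }
  return 0;
}
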
@@ -1483,8 +1469,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
static const uint16_t SETFOpcTable[2][3] = {
- { X86::SETEr, X86::SETNPr, X86::AND8rr },
- { X86::SETNEr, X86::SETPr, X86::OR8rr }
+ { X86::COND_E, X86::COND_NP, X86::AND8rr },
+ { X86::COND_NE, X86::COND_P, X86::OR8rr }
};
const uint16_t *SETFOpc = nullptr;
switch (Predicate) {
@@ -1500,10 +1486,10 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
- FlagReg1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
- FlagReg2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg1).addImm(SETFOpc[0]);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg2).addImm(SETFOpc[1]);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
ResultReg).addReg(FlagReg1).addReg(FlagReg2);
updateValueMap(I, ResultReg);
@@ -1514,7 +1500,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
bool SwapArgs;
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
- unsigned Opc = X86::getSETFromCond(CC);
if (SwapArgs)
std::swap(LHS, RHS);
@@ -1523,7 +1508,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ ResultReg).addImm(CC);
updateValueMap(I, ResultReg);
return true;
}
@@ -1693,11 +1679,9 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
}
bool SwapArgs;
- unsigned BranchOpc;
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
- BranchOpc = X86::GetCondBranchFromCond(CC);
if (SwapArgs)
std::swap(CmpLHS, CmpRHS);
@@ -1705,14 +1689,14 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
- .addMBB(TrueMBB);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(CC);
// X86 requires a second branch to handle UNE (and OEQ, which is mapped
// to UNE above).
if (NeedExtraBranch) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
- .addMBB(TrueMBB);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(X86::COND_P);
}
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
@@ -1739,14 +1723,14 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
.addReg(OpReg).addImm(1);
- unsigned JmpOpc = X86::JNE_1;
+ unsigned JmpCond = X86::COND_NE;
if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
std::swap(TrueMBB, FalseMBB);
- JmpOpc = X86::JE_1;
+ JmpCond = X86::COND_E;
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
- .addMBB(TrueMBB);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(JmpCond);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
@@ -1759,10 +1743,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
if (TmpReg == 0)
return false;
- unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
-
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
- .addMBB(TrueMBB);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(CC);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
@@ -1786,8 +1768,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(OpReg)
.addImm(1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
- .addMBB(TrueMBB);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(X86::COND_NE);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
@@ -2050,8 +2032,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
static const uint16_t SETFOpcTable[2][3] = {
- { X86::SETNPr, X86::SETEr , X86::TEST8rr },
- { X86::SETPr, X86::SETNEr, X86::OR8rr }
+ { X86::COND_NP, X86::COND_E, X86::TEST8rr },
+ { X86::COND_P, X86::COND_NE, X86::OR8rr }
};
const uint16_t *SETFOpc = nullptr;
switch (Predicate) {
@@ -2083,10 +2065,10 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
if (SETFOpc) {
unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
- FlagReg1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
- FlagReg2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg1).addImm(SETFOpc[0]);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg2).addImm(SETFOpc[1]);
auto const &II = TII.get(SETFOpc[2]);
if (II.getNumDefs()) {
unsigned TmpReg = createResultReg(&X86::GR8RegClass);
@@ -2147,9 +2129,9 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
return false;
const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
- unsigned Opc = X86::getCMovFromCond(CC, TRI.getRegSizeInBits(*RC)/8);
- unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
- LHSReg, LHSIsKill);
+ unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8);
+ unsigned ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill,
+ LHSReg, LHSIsKill, CC);
updateValueMap(I, ResultReg);
return true;
}
@@ -2194,19 +2176,6 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
if (NeedSwap)
std::swap(CmpLHS, CmpRHS);
- // Choose the SSE instruction sequence based on data type (float or double).
- static const uint16_t OpcTable[2][4] = {
- { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr },
- { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr }
- };
-
- const uint16_t *Opc = nullptr;
- switch (RetVT.SimpleTy) {
- default: return false;
- case MVT::f32: Opc = &OpcTable[0][0]; break;
- case MVT::f64: Opc = &OpcTable[1][0]; break;
- }
-
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
@@ -2277,6 +2246,19 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
} else {
+ // Choose the SSE instruction sequence based on data type (float or double).
+ static const uint16_t OpcTable[2][4] = {
+ { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr },
+ { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr }
+ };
+
+ const uint16_t *Opc = nullptr;
+ switch (RetVT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = &OpcTable[0][0]; break;
+ case MVT::f64: Opc = &OpcTable[1][0]; break;
+ }
+
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
@@ -2303,8 +2285,10 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
case MVT::i8: Opc = X86::CMOV_GR8; break;
case MVT::i16: Opc = X86::CMOV_GR16; break;
case MVT::i32: Opc = X86::CMOV_GR32; break;
- case MVT::f32: Opc = X86::CMOV_FR32; break;
- case MVT::f64: Opc = X86::CMOV_FR64; break;
+ case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X
+ : X86::CMOV_FR32; break;
+ case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X
+ : X86::CMOV_FR64; break;
}
const Value *Cond = I->getOperand(0);
@@ -2485,13 +2469,14 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
assert((I->getOpcode() == Instruction::FPExt ||
I->getOpcode() == Instruction::FPTrunc) &&
"Instruction must be an FPExt or FPTrunc!");
+ bool HasAVX = Subtarget->hasAVX();
unsigned OpReg = getRegForValue(I->getOperand(0));
if (OpReg == 0)
return false;
unsigned ImplicitDefReg;
- if (Subtarget->hasAVX()) {
+ if (HasAVX) {
ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
@@ -2503,7 +2488,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
ResultReg);
- if (Subtarget->hasAVX())
+ if (HasAVX)
MIB.addReg(ImplicitDefReg);
MIB.addReg(OpReg);
@@ -2519,8 +2504,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
unsigned Opc =
HasAVX512 ? X86::VCVTSS2SDZrr
: Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
- return X86SelectFPExtOrFPTrunc(
- I, Opc, HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass);
+ return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64));
}
return false;
@@ -2534,8 +2518,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
unsigned Opc =
HasAVX512 ? X86::VCVTSD2SSZrr
: Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
- return X86SelectFPExtOrFPTrunc(
- I, Opc, HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass);
+ return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32));
}
return false;
@@ -2900,21 +2883,21 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
isCommutativeIntrinsic(II))
std::swap(LHS, RHS);
- unsigned BaseOpc, CondOpc;
+ unsigned BaseOpc, CondCode;
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::sadd_with_overflow:
- BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break;
+ BaseOpc = ISD::ADD; CondCode = X86::COND_O; break;
case Intrinsic::uadd_with_overflow:
- BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
+ BaseOpc = ISD::ADD; CondCode = X86::COND_B; break;
case Intrinsic::ssub_with_overflow:
- BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break;
+ BaseOpc = ISD::SUB; CondCode = X86::COND_O; break;
case Intrinsic::usub_with_overflow:
- BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
+ BaseOpc = ISD::SUB; CondCode = X86::COND_B; break;
case Intrinsic::smul_with_overflow:
- BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
+ BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break;
case Intrinsic::umul_with_overflow:
- BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
+ BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break;
}
unsigned LHSReg = getRegForValue(LHS);
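
The table above pairs each overflow intrinsic with the x86 condition that reports it: unsigned add/sub overflow shows up in the carry flag (COND_B), while signed add/sub and both multiplies use the overflow flag (COND_O). A small standalone check that mirrors why those are the right flags (plain C++, not the FastISel code):

#include <cstdint>
#include <limits>

// Unsigned overflow corresponds to the carry/borrow flag (COND_B after ADD/SUB).
bool uaddOverflows(uint32_t A, uint32_t B) {
  return A > std::numeric_limits<uint32_t>::max() - B; // carry out of bit 31
}

// Signed overflow corresponds to the overflow flag (COND_O).
bool saddOverflows(int32_t A, int32_t B) {
  int64_t Wide = int64_t(A) + int64_t(B);
  return Wide < std::numeric_limits<int32_t>::min() ||
         Wide > std::numeric_limits<int32_t>::max();
}
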
@@ -2931,7 +2914,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
};
if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) &&
- CondOpc == X86::SETOr) {
+ CondCode == X86::COND_O) {
// We can use INC/DEC.
ResultReg = createResultReg(TLI.getRegClassFor(VT));
bool IsDec = BaseOpc == ISD::SUB;
@@ -2990,8 +2973,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// Assign to a GPR since the overflow return value is lowered to a SETcc.
unsigned ResultReg2 = createResultReg(&X86::GR8RegClass);
assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
- ResultReg2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ ResultReg2).addImm(CondCode);
updateValueMap(II, ResultReg, 2);
return true;
@@ -3509,8 +3492,9 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// This will be a direct call, or an indirect call through memory for
// NonLazyBind calls or dllimport calls.
- bool NeedLoad =
- OpFlags == X86II::MO_DLLIMPORT || OpFlags == X86II::MO_GOTPCREL;
+ bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT ||
+ OpFlags == X86II::MO_GOTPCREL ||
+ OpFlags == X86II::MO_COFFSTUB;
unsigned CallOpc = NeedLoad
? (Is64Bit ? X86::CALL64m : X86::CALL32m)
: (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
@@ -3595,7 +3579,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc)), FI)
.addReg(CopyReg);
- Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+ Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt;
addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg + i), FI);
}
@@ -3662,24 +3646,19 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
return true;
}
case Instruction::BitCast: {
- // Select SSE2/AVX bitcasts between 128/256 bit vector types.
+ // Select SSE2/AVX bitcasts between 128/256/512 bit vector types.
if (!Subtarget->hasSSE2())
return false;
- EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
- EVT DstVT = TLI.getValueType(DL, I->getType());
-
- if (!SrcVT.isSimple() || !DstVT.isSimple())
+ MVT SrcVT, DstVT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT) ||
+ !isTypeLegal(I->getType(), DstVT))
return false;
- MVT SVT = SrcVT.getSimpleVT();
- MVT DVT = DstVT.getSimpleVT();
-
- if (!SVT.is128BitVector() &&
- !(Subtarget->hasAVX() && SVT.is256BitVector()) &&
- !(Subtarget->hasAVX512() && SVT.is512BitVector() &&
- (Subtarget->hasBWI() || (SVT.getScalarSizeInBits() >= 32 &&
- DVT.getScalarSizeInBits() >= 32))))
+ // Only allow vectors that use xmm/ymm/zmm.
+ if (!SrcVT.isVector() || !DstVT.isVector() ||
+ SrcVT.getVectorElementType() == MVT::i1 ||
+ DstVT.getVectorElementType() == MVT::i1)
return false;
unsigned Reg = getRegForValue(I->getOperand(0));
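
The new guard above restricts FastISel bitcasts to legal vector types that live in xmm/ymm/zmm registers, rejecting i1 mask vectors on either side. A minimal standalone model of that predicate, using an illustrative type descriptor rather than the MVT API:

// Illustrative type descriptor; not the LLVM MVT interface.
struct VecType {
  bool IsVector;
  unsigned ElemBits; // 1 marks an i1 mask-vector element
};

// Both sides must be vectors and neither may have i1 elements.
bool canFastIselBitcast(const VecType &Src, const VecType &Dst) {
  return Src.IsVector && Dst.IsVector && Src.ElemBits != 1 && Dst.ElemBits != 1;
}
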
@@ -3757,30 +3736,25 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
+ bool HasAVX = Subtarget->hasAVX();
+ bool HasAVX512 = Subtarget->hasAVX512();
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
- if (X86ScalarSSEf32) {
- Opc = Subtarget->hasAVX512()
- ? X86::VMOVSSZrm
- : Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = Subtarget->hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
- } else {
+ if (X86ScalarSSEf32)
+ Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt;
+ else
Opc = X86::LD_Fp32m;
- RC = &X86::RFP32RegClass;
- }
break;
case MVT::f64:
- if (X86ScalarSSEf64) {
- Opc = Subtarget->hasAVX512()
- ? X86::VMOVSDZrm
- : Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = Subtarget->hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
- } else {
+ if (X86ScalarSSEf64)
+ Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt;
+ else
Opc = X86::LD_Fp64m;
- RC = &X86::RFP64RegClass;
- }
break;
case MVT::f80:
// No f80 support yet.
@@ -3806,7 +3780,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
// Create the load from the constant pool.
unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
- unsigned ResultReg = createResultReg(RC);
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
if (CM == CodeModel::Large) {
unsigned AddrReg = createResultReg(&X86::GR64RegClass);
@@ -3916,33 +3890,26 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
// Get opcode and regclass for the given zero.
bool HasAVX512 = Subtarget->hasAVX512();
unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
- if (X86ScalarSSEf32) {
+ if (X86ScalarSSEf32)
Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
- RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass;
- } else {
+ else
Opc = X86::LD_Fp032;
- RC = &X86::RFP32RegClass;
- }
break;
case MVT::f64:
- if (X86ScalarSSEf64) {
+ if (X86ScalarSSEf64)
Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
- RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass;
- } else {
+ else
Opc = X86::LD_Fp064;
- RC = &X86::RFP64RegClass;
- }
break;
case MVT::f80:
// No f80 support yet.
return 0;
}
- unsigned ResultReg = createResultReg(RC);
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
return ResultReg;
}
@@ -3992,6 +3959,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
}
Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
+ Result->cloneInstrSymbols(*FuncInfo.MF, *MI);
MachineBasicBlock::iterator I(MI);
removeDeadCode(I, std::next(I));
return true;
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index ed297e678203..bf541d933790 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -1,9 +1,8 @@
//===-- X86FixupBWInsts.cpp - Fixup Byte or Word instructions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -103,9 +102,7 @@ public:
StringRef getPassName() const override { return FIXUPBW_DESC; }
- FixupBWInstPass() : MachineFunctionPass(ID) {
- initializeFixupBWInstPassPass(*PassRegistry::getPassRegistry());
- }
+ FixupBWInstPass() : MachineFunctionPass(ID) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
@@ -151,7 +148,7 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
this->MF = &MF;
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
- OptForSize = MF.getFunction().optForSize();
+ OptForSize = MF.getFunction().hasOptSize();
MLI = &getAnalysis<MachineLoopInfo>();
LiveRegs.init(TII->getRegisterInfo());
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index a346085a52cb..041529a0be68 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -1,15 +1,14 @@
//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass that finds instructions that can be
// re-written as LEA instructions in order to reduce pipeline delays.
-// When optimizing for size it replaces suitable LEAs with INC or DEC.
+// It replaces LEAs with ADD/INC/DEC when that is better for size/speed.
//
//===----------------------------------------------------------------------===//
@@ -36,31 +35,25 @@ namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- /// Loop over all of the instructions in the basic block
- /// replacing applicable instructions with LEA instructions,
- /// where appropriate.
- bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI,
- bool IsSlowLEA, bool IsSlow3OpsLEA);
-
/// Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
/// try to replace it with an equivalent LEA instruction.
/// If replacement succeeds, then also process the newly created
/// instruction.
void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI);
+ MachineBasicBlock &MBB);
/// Given a memory access or LEA instruction
/// whose address mode uses a base and/or index register, look for
/// an opportunity to replace the instruction which sets the base or index
/// register with an equivalent LEA instruction.
void processInstruction(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI);
+ MachineBasicBlock &MBB);
/// Given a LEA instruction which is unprofitable
/// on SlowLEA targets try to replace it with an equivalent ADD instruction.
void processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI);
+ MachineBasicBlock &MBB);
/// Given a LEA instruction which is unprofitable
/// on SNB+ try to replace it with other instructions.
@@ -75,12 +68,13 @@ class FixupLEAPass : public MachineFunctionPass {
/// - LEA that uses 16-bit addressing mode "
/// This function currently handles the first 2 cases only.
MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
- MachineFunction::iterator MFI);
+ MachineBasicBlock &MBB);
- /// Look for LEAs that add 1 to reg or subtract 1 from reg
- /// and convert them to INC or DEC respectively.
- bool fixupIncDec(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) const;
+ /// Look for LEAs that are really two-address LEAs that we might be able to
+ /// turn into regular ADD instructions.
+ bool optTwoAddrLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB, bool OptIncDec,
+ bool UseLEAForSP) const;
/// Determine if an instruction references a machine register
/// and, if so, whether it reads or writes the register.
@@ -91,12 +85,12 @@ class FixupLEAPass : public MachineFunctionPass {
/// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI);
+ MachineBasicBlock &MBB);
/// if an instruction can be converted to an
/// equivalent LEA, insert the new instruction into the basic block
/// and return a pointer to it. Otherwise, return zero.
- MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
+ MachineInstr *postRAConvertToLEA(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) const;
public:
@@ -104,9 +98,7 @@ public:
StringRef getPassName() const override { return FIXUPLEA_DESC; }
- FixupLEAPass() : MachineFunctionPass(ID) {
- initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
- }
+ FixupLEAPass() : MachineFunctionPass(ID) { }
/// Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
@@ -121,10 +113,8 @@ public:
private:
TargetSchedModel TSM;
- MachineFunction *MF;
- const X86InstrInfo *TII; // Machine instruction info.
- bool OptIncDec;
- bool OptLEA;
+ const X86InstrInfo *TII;
+ const X86RegisterInfo *TRI;
};
}
@@ -133,7 +123,7 @@ char FixupLEAPass::ID = 0;
INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
MachineInstr *
-FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
+FixupLEAPass::postRAConvertToLEA(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) const {
MachineInstr &MI = *MBBI;
switch (MI.getOpcode()) {
@@ -142,7 +132,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
const MachineOperand &Src = MI.getOperand(1);
const MachineOperand &Dest = MI.getOperand(0);
MachineInstr *NewMI =
- BuildMI(*MF, MI.getDebugLoc(),
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(MI.getOpcode() == X86::MOV32rr ? X86::LEA32r
: X86::LEA64r))
.add(Dest)
@@ -151,9 +141,17 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
.addReg(0)
.addImm(0)
.addReg(0);
- MFI->insert(MBBI, NewMI); // Insert the new inst
return NewMI;
}
+ }
+
+ if (!MI.isConvertibleTo3Addr())
+ return nullptr;
+
+ switch (MI.getOpcode()) {
+ default:
+ // Only convert instructions that we've verified are safe.
+ return nullptr;
case X86::ADD64ri32:
case X86::ADD64ri8:
case X86::ADD64ri32_DB:
@@ -162,52 +160,80 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
case X86::ADD32ri8:
case X86::ADD32ri_DB:
case X86::ADD32ri8_DB:
- case X86::ADD16ri:
- case X86::ADD16ri8:
- case X86::ADD16ri_DB:
- case X86::ADD16ri8_DB:
if (!MI.getOperand(2).isImm()) {
// convertToThreeAddress will call getImm()
// which requires isImm() to be true
return nullptr;
}
break;
- case X86::ADD16rr:
- case X86::ADD16rr_DB:
- if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg()) {
- // if src1 != src2, then convertToThreeAddress will
- // need to create a Virtual register, which we cannot do
- // after register allocation.
- return nullptr;
- }
+ case X86::SHL64ri:
+ case X86::SHL32ri:
+ case X86::INC64r:
+ case X86::INC32r:
+ case X86::DEC64r:
+ case X86::DEC32r:
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB:
+ // These instructions are all fine to convert.
+ break;
}
+ MachineFunction::iterator MFI = MBB.getIterator();
return TII->convertToThreeAddress(MFI, MI, nullptr);
}
FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
-bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
- if (skipFunction(Func.getFunction()))
+static bool isLEA(unsigned Opcode) {
+ return Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r;
+}
+
+bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
return false;
- MF = &Func;
- const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
bool IsSlowLEA = ST.slowLEA();
bool IsSlow3OpsLEA = ST.slow3OpsLEA();
+ bool LEAUsesAG = ST.LEAusesAG();
- OptIncDec = !ST.slowIncDec() || Func.getFunction().optForMinSize();
- OptLEA = ST.LEAusesAG() || IsSlowLEA || IsSlow3OpsLEA;
-
- if (!OptLEA && !OptIncDec)
- return false;
+ bool OptIncDec = !ST.slowIncDec() || MF.getFunction().hasOptSize();
+ bool UseLEAForSP = ST.useLeaForSP();
- TSM.init(&Func.getSubtarget());
+ TSM.init(&ST);
TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
- // Process all basic blocks.
- for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
- processBasicBlock(Func, I, IsSlowLEA, IsSlow3OpsLEA);
+ for (MachineBasicBlock &MBB : MF) {
+ // First pass. Try to remove or optimize existing LEAs.
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ if (!isLEA(I->getOpcode()))
+ continue;
+
+ if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP))
+ continue;
+
+ if (IsSlowLEA) {
+ processInstructionForSlowLEA(I, MBB);
+ } else if (IsSlow3OpsLEA) {
+ if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) {
+ MBB.erase(I);
+ I = NewMI;
+ }
+ }
+ }
+
+ // Second pass for creating LEAs. This may reverse some of the
+ // transformations above.
+ if (LEAUsesAG) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+ processInstruction(I, MBB);
+ }
+ }
+
LLVM_DEBUG(dbgs() << "End X86FixupLEAs\n";);
return true;
@@ -218,7 +244,7 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
RegUsageState RegUsage = RU_NotUsed;
MachineInstr &MI = *I;
- for (unsigned int i = 0; i < MI.getNumOperands(); ++i) {
+ for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
MachineOperand &opnd = MI.getOperand(i);
if (opnd.isReg() && opnd.getReg() == p.getReg()) {
if (opnd.isDef())
@@ -234,10 +260,10 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
/// wrapping around to the last instruction of the block if the block
/// branches to itself.
static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) {
- if (I == MFI->begin()) {
- if (MFI->isPredecessor(&*MFI)) {
- I = --MFI->end();
+ MachineBasicBlock &MBB) {
+ if (I == MBB.begin()) {
+ if (MBB.isPredecessor(&MBB)) {
+ I = --MBB.end();
return true;
} else
return false;
@@ -248,14 +274,14 @@ static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
MachineBasicBlock::iterator
FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) {
+ MachineBasicBlock &MBB) {
int InstrDistance = 1;
MachineBasicBlock::iterator CurInst;
static const int INSTR_DISTANCE_THRESHOLD = 5;
CurInst = I;
bool Found;
- Found = getPreviousInstr(CurInst, MFI);
+ Found = getPreviousInstr(CurInst, MBB);
while (Found && I != CurInst) {
if (CurInst->isCall() || CurInst->isInlineAsm())
break;
@@ -265,17 +291,12 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return CurInst;
}
InstrDistance += TSM.computeInstrLatency(&*CurInst);
- Found = getPreviousInstr(CurInst, MFI);
+ Found = getPreviousInstr(CurInst, MBB);
}
return MachineBasicBlock::iterator();
}
-static inline bool isLEA(const int Opcode) {
- return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
- Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
-}
-
-static inline bool isInefficientLEAReg(unsigned int Reg) {
+static inline bool isInefficientLEAReg(unsigned Reg) {
return Reg == X86::EBP || Reg == X86::RBP ||
Reg == X86::R13D || Reg == X86::R13;
}
@@ -298,27 +319,24 @@ static inline bool hasLEAOffset(const MachineOperand &Offset) {
return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
}
-static inline int getADDrrFromLEA(int LEAOpcode) {
+static inline unsigned getADDrrFromLEA(unsigned LEAOpcode) {
switch (LEAOpcode) {
default:
llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- return X86::ADD16rr;
case X86::LEA32r:
- return X86::ADD32rr;
case X86::LEA64_32r:
+ return X86::ADD32rr;
case X86::LEA64r:
return X86::ADD64rr;
}
}
-static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
+static inline unsigned getADDriFromLEA(unsigned LEAOpcode,
+ const MachineOperand &Offset) {
bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
switch (LEAOpcode) {
default:
llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
case X86::LEA32r:
case X86::LEA64_32r:
return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
@@ -327,56 +345,110 @@ static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
}
}
-/// isLEASimpleIncOrDec - Does this LEA have one these forms:
-/// lea %reg, 1(%reg)
-/// lea %reg, -1(%reg)
-static inline bool isLEASimpleIncOrDec(MachineInstr &LEA) {
- unsigned SrcReg = LEA.getOperand(1 + X86::AddrBaseReg).getReg();
- unsigned DstReg = LEA.getOperand(0).getReg();
- const MachineOperand &AddrDisp = LEA.getOperand(1 + X86::AddrDisp);
- return SrcReg == DstReg &&
- LEA.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
- LEA.getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
- AddrDisp.isImm() &&
- (AddrDisp.getImm() == 1 || AddrDisp.getImm() == -1);
+static inline unsigned getINCDECFromLEA(unsigned LEAOpcode, bool IsINC) {
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return IsINC ? X86::INC32r : X86::DEC32r;
+ case X86::LEA64r:
+ return IsINC ? X86::INC64r : X86::DEC64r;
+ }
}
-bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) const {
+bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB, bool OptIncDec,
+ bool UseLEAForSP) const {
MachineInstr &MI = *I;
- int Opcode = MI.getOpcode();
- if (!isLEA(Opcode))
+
+ const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
+ const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt);
+ const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg);
+ const MachineOperand &Disp = MI.getOperand(1 + X86::AddrDisp);
+ const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
+
+ if (Segment.getReg() != 0 || !Disp.isImm() || Scale.getImm() > 1 ||
+ !TII->isSafeToClobberEFLAGS(MBB, I))
return false;
- if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) {
- int NewOpcode;
- bool isINC = MI.getOperand(1 + X86::AddrDisp).getImm() == 1;
- switch (Opcode) {
- case X86::LEA16r:
- NewOpcode = isINC ? X86::INC16r : X86::DEC16r;
- break;
- case X86::LEA32r:
- case X86::LEA64_32r:
- NewOpcode = isINC ? X86::INC32r : X86::DEC32r;
- break;
- case X86::LEA64r:
- NewOpcode = isINC ? X86::INC64r : X86::DEC64r;
- break;
- }
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned BaseReg = Base.getReg();
+ unsigned IndexReg = Index.getReg();
- MachineInstr *NewMI =
- BuildMI(*MFI, I, MI.getDebugLoc(), TII->get(NewOpcode))
- .add(MI.getOperand(0))
- .add(MI.getOperand(1 + X86::AddrBaseReg));
- MFI->erase(I);
- I = static_cast<MachineBasicBlock::iterator>(NewMI);
- return true;
+ // Don't change stack adjustment LEAs.
+ if (UseLEAForSP && (DestReg == X86::ESP || DestReg == X86::RSP))
+ return false;
+
+ // LEA64_32 has 64-bit operands but 32-bit result.
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ if (BaseReg != 0)
+ BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit);
+ if (IndexReg != 0)
+ IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit);
}
- return false;
+
+ MachineInstr *NewMI = nullptr;
+
+ // Look for lea(%reg1, %reg2), %reg1 or lea(%reg2, %reg1), %reg1
+ // which can be turned into add %reg2, %reg1
+ if (BaseReg != 0 && IndexReg != 0 && Disp.getImm() == 0 &&
+ (DestReg == BaseReg || DestReg == IndexReg)) {
+ unsigned NewOpcode = getADDrrFromLEA(MI.getOpcode());
+ if (DestReg != BaseReg)
+ std::swap(BaseReg, IndexReg);
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addReg(IndexReg)
+ .addReg(Base.getReg(), RegState::Implicit)
+ .addReg(Index.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addReg(IndexReg);
+ }
+ } else if (DestReg == BaseReg && IndexReg == 0) {
+ // This is an LEA with only a base register and a displacement;
+ // we can use ADDri or INC/DEC.
+
+ // Does this LEA have one of these forms:
+ // lea %reg, 1(%reg)
+ // lea %reg, -1(%reg)
+ if (OptIncDec && (Disp.getImm() == 1 || Disp.getImm() == -1)) {
+ bool IsINC = Disp.getImm() == 1;
+ unsigned NewOpcode = getINCDECFromLEA(MI.getOpcode(), IsINC);
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addReg(Base.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg);
+ }
+ } else {
+ unsigned NewOpcode = getADDriFromLEA(MI.getOpcode(), Disp);
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addImm(Disp.getImm())
+ .addReg(Base.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addImm(Disp.getImm());
+ }
+ }
+ } else
+ return false;
+
+ MBB.erase(I);
+ I = NewMI;
+ return true;
}
void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) {
+ MachineBasicBlock &MBB) {
// Process a load, store, or LEA instruction.
MachineInstr &MI = *I;
const MCInstrDesc &Desc = MI.getDesc();
@@ -385,40 +457,38 @@ void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
AddrOffset += X86II::getOperandBias(Desc);
MachineOperand &p = MI.getOperand(AddrOffset + X86::AddrBaseReg);
if (p.isReg() && p.getReg() != X86::ESP) {
- seekLEAFixup(p, I, MFI);
+ seekLEAFixup(p, I, MBB);
}
MachineOperand &q = MI.getOperand(AddrOffset + X86::AddrIndexReg);
if (q.isReg() && q.getReg() != X86::ESP) {
- seekLEAFixup(q, I, MFI);
+ seekLEAFixup(q, I, MBB);
}
}
}
void FixupLEAPass::seekLEAFixup(MachineOperand &p,
MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) {
- MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
+ MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator MBI = searchBackwards(p, I, MBB);
if (MBI != MachineBasicBlock::iterator()) {
- MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI);
+ MachineInstr *NewMI = postRAConvertToLEA(MBB, MBI);
if (NewMI) {
++NumLEAs;
LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
// now to replace with an equivalent LEA...
LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
- MFI->erase(MBI);
+ MBB.erase(MBI);
MachineBasicBlock::iterator J =
static_cast<MachineBasicBlock::iterator>(NewMI);
- processInstruction(J, MFI);
+ processInstruction(J, MBB);
}
}
}
void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) {
+ MachineBasicBlock &MBB) {
MachineInstr &MI = *I;
- const int Opcode = MI.getOpcode();
- if (!isLEA(Opcode))
- return;
+ const unsigned Opcode = MI.getOpcode();
const MachineOperand &Dst = MI.getOperand(0);
const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
@@ -428,7 +498,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
if (Segment.getReg() != 0 || !Offset.isImm() ||
- !TII->isSafeToClobberEFLAGS(*MFI, I))
+ !TII->isSafeToClobberEFLAGS(MBB, I))
return;
const unsigned DstR = Dst.getReg();
const unsigned SrcR1 = Base.getReg();
@@ -445,7 +515,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
const MachineOperand &Src = SrcR1 == DstR ? Index : Base;
NewMI =
- BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
+ BuildMI(MBB, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
LLVM_DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
@@ -453,24 +523,21 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
const MCInstrDesc &ADDri =
TII->get(getADDriFromLEA(Opcode, Offset));
const MachineOperand &SrcR = SrcR1 == DstR ? Base : Index;
- NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), ADDri, DstR)
.add(SrcR)
.addImm(Offset.getImm());
LLVM_DEBUG(NewMI->dump(););
}
if (NewMI) {
- MFI->erase(I);
+ MBB.erase(I);
I = NewMI;
}
}
MachineInstr *
FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
- MachineFunction::iterator MFI) {
-
- const int LEAOpcode = MI.getOpcode();
- if (!isLEA(LEAOpcode))
- return nullptr;
+ MachineBasicBlock &MBB) {
+ const unsigned LEAOpcode = MI.getOpcode();
const MachineOperand &Dst = MI.getOperand(0);
const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
@@ -481,13 +548,13 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
if (!(TII->isThreeOperandsLEA(MI) ||
hasInefficientLEABaseReg(Base, Index)) ||
- !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
+ !TII->isSafeToClobberEFLAGS(MBB, MI) ||
Segment.getReg() != X86::NoRegister)
return nullptr;
- unsigned int DstR = Dst.getReg();
- unsigned int BaseR = Base.getReg();
- unsigned int IndexR = Index.getReg();
+ unsigned DstR = Dst.getReg();
+ unsigned BaseR = Base.getReg();
+ unsigned IndexR = Index.getReg();
unsigned SSDstR =
(LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
bool IsScale1 = Scale.getImm() == 1;
@@ -516,11 +583,11 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
const MachineOperand &Src = DstR == BaseR ? Index : Base;
MachineInstr *NewMI =
- BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
+ BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
LLVM_DEBUG(NewMI->dump(););
// Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) {
- NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
LLVM_DEBUG(NewMI->dump(););
}
return NewMI;
@@ -530,7 +597,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
// lea offset(%base,%index,scale),%dst =>
// lea (%base,%index,scale); add offset,%dst
if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
- MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
.add(Dst)
.add(IsInefficientBase ? Index : Base)
.add(Scale)
@@ -540,7 +607,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
LLVM_DEBUG(NewMI->dump(););
// Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) {
- NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
LLVM_DEBUG(NewMI->dump(););
}
return NewMI;
@@ -552,17 +619,17 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
// lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
if (IsScale1 && !hasLEAOffset(Offset)) {
bool BIK = Base.isKill() && BaseR != IndexR;
- TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, BIK);
+ TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK);
LLVM_DEBUG(MI.getPrevNode()->dump(););
MachineInstr *NewMI =
- BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+ BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
LLVM_DEBUG(NewMI->dump(););
return NewMI;
}
// lea offset(%base,%index,scale), %dst =>
// lea offset( ,%index,scale), %dst; add %base,%dst
- MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
.add(Dst)
.addReg(0)
.add(Scale)
@@ -571,35 +638,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
.add(Segment);
LLVM_DEBUG(NewMI->dump(););
- NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+ NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
LLVM_DEBUG(NewMI->dump(););
return NewMI;
}
-
-bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
- MachineFunction::iterator MFI,
- bool IsSlowLEA, bool IsSlow3OpsLEA) {
- for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
- if (OptIncDec)
- if (fixupIncDec(I, MFI))
- continue;
-
- if (OptLEA) {
- if (IsSlowLEA) {
- processInstructionForSlowLEA(I, MFI);
- continue;
- }
-
- if (IsSlow3OpsLEA) {
- if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
- MFI->erase(I);
- I = NewMI;
- }
- continue;
- }
-
- processInstruction(I, MFI);
- }
- }
- return false;
-}
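
The core of the new optTwoAddrLEA above is a small classification of the LEA's address components; the EFLAGS-clobber check, the stack-pointer exclusion, and the LEA64_32r sub-register handling are omitted here. A minimal standalone model of that decision, with illustrative names (the Rewrite enum and register encoding are not LLVM's):

#include <cstdint>

enum class Rewrite { None, AddRegReg, IncDec, AddRegImm };

// Register number 0 means "no register", as in the X86 address operands.
Rewrite classifyLEA(unsigned Dest, unsigned Base, unsigned Index, int64_t Scale,
                    int64_t Disp, unsigned Segment, bool OptIncDec) {
  if (Segment != 0 || Scale > 1)
    return Rewrite::None;            // still needs the full addressing mode
  if (Base && Index && Disp == 0 && (Dest == Base || Dest == Index))
    return Rewrite::AddRegReg;       // lea (%r1,%r2), %r1  ->  add %r2, %r1
  if (Dest == Base && Index == 0) {
    if (OptIncDec && (Disp == 1 || Disp == -1))
      return Rewrite::IncDec;        // lea 1(%r), %r       ->  inc %r
    return Rewrite::AddRegImm;       // lea d(%r), %r       ->  add $d, %r
  }
  return Rewrite::None;
}
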
diff --git a/lib/Target/X86/X86FixupSetCC.cpp b/lib/Target/X86/X86FixupSetCC.cpp
index a86eb997635e..e2d4d1ede6f3 100644
--- a/lib/Target/X86/X86FixupSetCC.cpp
+++ b/lib/Target/X86/X86FixupSetCC.cpp
@@ -1,9 +1,8 @@
//===---- X86FixupSetCC.cpp - optimize usage of setcc instructions --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -68,30 +67,6 @@ char X86FixupSetCCPass::ID = 0;
FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
-bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) {
- switch (Opcode) {
- default:
- return false;
- case X86::SETOr:
- case X86::SETNOr:
- case X86::SETBr:
- case X86::SETAEr:
- case X86::SETEr:
- case X86::SETNEr:
- case X86::SETBEr:
- case X86::SETAr:
- case X86::SETSr:
- case X86::SETNSr:
- case X86::SETPr:
- case X86::SETNPr:
- case X86::SETLr:
- case X86::SETGEr:
- case X86::SETLEr:
- case X86::SETGr:
- return true;
- }
-}
-
// We expect the instruction *immediately* before the setcc to imp-def
// EFLAGS (because of scheduling glue). To make this less brittle w.r.t
// scheduling, look backwards until we hit the beginning of the
@@ -103,7 +78,7 @@ X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,
auto MBBStart = MBB->rend();
for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI)
for (auto &Op : MI->implicit_operands())
- if ((Op.getReg() == X86::EFLAGS) && (Op.isDef()))
+ if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isDef())
return &*MI;
return nullptr;
@@ -111,7 +86,7 @@ X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,
bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) {
for (auto &Op : MI->implicit_operands())
- if ((Op.getReg() == X86::EFLAGS) && (Op.isUse()))
+ if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isUse())
return true;
return false;
@@ -129,7 +104,7 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
// Find a setcc that is used by a zext.
// This doesn't have to be the only use, the transformation is safe
// regardless.
- if (!isSetCCr(MI.getOpcode()))
+ if (MI.getOpcode() != X86::SETCCr)
continue;
MachineInstr *ZExt = nullptr;
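
The two hunks above also harden the implicit-operand scans: an implicit operand can be a register mask rather than a register, so Op.getReg() is only meaningful after an Op.isReg() check. A minimal standalone sketch of the same guard, with an illustrative operand model in place of MachineOperand:

#include <vector>

// Illustrative operand model: either a register or a register mask.
struct Operand {
  enum Kind { Register, RegMask } K;
  unsigned Reg = 0;   // only meaningful when K == Register
  bool IsDef = false;
};

constexpr unsigned EFLAGS = 25; // illustrative register number

// Safe scan: check the operand kind before asking for its register.
bool defsEFLAGS(const std::vector<Operand> &ImplicitOps) {
  for (const Operand &Op : ImplicitOps)
    if (Op.K == Operand::Register && Op.Reg == EFLAGS && Op.IsDef)
      return true;
  return false;
}
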
diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp
index 778aa505b2d9..5ce3255ea96a 100644
--- a/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -1,9 +1,8 @@
//====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -71,12 +70,6 @@ STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
STATISTIC(NumTestsInserted, "Number of test instructions inserted");
STATISTIC(NumAddsInserted, "Number of adds instructions inserted");
-namespace llvm {
-
-void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
-
-} // end namespace llvm
-
namespace {
// Convenient array type for storing registers associated with each condition.
@@ -84,9 +77,7 @@ using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;
class X86FlagsCopyLoweringPass : public MachineFunctionPass {
public:
- X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {
- initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry());
- }
+ X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -252,13 +243,13 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
"Split instruction must be in the split block!");
assert(SplitI.isBranch() &&
"Only designed to split a tail of branch instructions!");
- assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID &&
+ assert(X86::getCondFromBranch(SplitI) != X86::COND_INVALID &&
"Must split on an actual jCC instruction!");
// Dig out the previous instruction to the split point.
MachineInstr &PrevI = *std::prev(SplitI.getIterator());
assert(PrevI.isBranch() && "Must split after a branch!");
- assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID &&
+ assert(X86::getCondFromBranch(PrevI) != X86::COND_INVALID &&
"Must split after an actual jCC instruction!");
assert(!std::prev(PrevI.getIterator())->isTerminator() &&
"Must only have this one terminator prior to the split!");
@@ -588,22 +579,21 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// branch folding or block placement. As a consequence, we get to deal
// with the simpler formulation of conditional branches followed by tail
// calls.
- if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
+ if (X86::getCondFromBranch(MI) != X86::COND_INVALID) {
auto JmpIt = MI.getIterator();
do {
JmpIs.push_back(&*JmpIt);
++JmpIt;
} while (JmpIt != UseMBB.instr_end() &&
- X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
+ X86::getCondFromBranch(*JmpIt) !=
X86::COND_INVALID);
break;
}
// Otherwise we can just rewrite in-place.
- if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
+ if (X86::getCondFromCMov(MI) != X86::COND_INVALID) {
rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
- X86::COND_INVALID) {
+ } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) {
rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
} else if (MI.getOpcode() == TargetOpcode::COPY) {
rewriteCopy(MI, *FlagUse, CopyDefI);
@@ -730,7 +720,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
// Scan backwards across the range of instructions with live EFLAGS.
for (MachineInstr &MI :
llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
- X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
+ X86::CondCode Cond = X86::getCondFromSETCC(MI);
if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() &&
TRI->isVirtualRegister(MI.getOperand(0).getReg())) {
assert(MI.getOperand(0).isDef() &&
@@ -751,7 +741,7 @@ unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
DebugLoc TestLoc, X86::CondCode Cond) {
unsigned Reg = MRI->createVirtualRegister(PromoteRC);
auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
- TII->get(X86::getSETFromCond(Cond)), Reg);
+ TII->get(X86::SETCCr), Reg).addImm(Cond);
(void)SetI;
LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump());
++NumSetCCsInserted;
@@ -842,7 +832,7 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
MachineOperand &FlagUse,
CondRegArray &CondRegs) {
// First get the register containing this specific condition.
- X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode());
+ X86::CondCode Cond = X86::getCondFromCMov(CMovI);
unsigned CondReg;
bool Inverted;
std::tie(CondReg, Inverted) =
@@ -853,12 +843,10 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
// Insert a direct test of the saved register.
insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
- // Rewrite the CMov to use the !ZF flag from the test (but match register
- // size and memory operand), and then kill its use of the flags afterward.
- auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg());
- CMovI.setDesc(TII->get(X86::getCMovFromCond(
- Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8,
- !CMovI.memoperands_empty())));
+ // Rewrite the CMov to use the !ZF flag from the test, and then kill its use
+ // of the flags afterward.
+ CMovI.getOperand(CMovI.getDesc().getNumOperands() - 1)
+ .setImm(Inverted ? X86::COND_E : X86::COND_NE);
FlagUse.setIsKill(true);
LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump());
}
@@ -867,7 +855,7 @@ void X86FlagsCopyLoweringPass::rewriteCondJmp(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
// First get the register containing this specific condition.
- X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode());
+ X86::CondCode Cond = X86::getCondFromBranch(JmpI);
unsigned CondReg;
bool Inverted;
std::tie(CondReg, Inverted) =
@@ -880,10 +868,8 @@ void X86FlagsCopyLoweringPass::rewriteCondJmp(
// Rewrite the jump to use the !ZF flag from the test, and kill its use of
// flags afterward.
- JmpI.setDesc(TII->get(
- X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE)));
- const int ImplicitEFLAGSOpIdx = 1;
- JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true);
+ JmpI.getOperand(1).setImm(Inverted ? X86::COND_E : X86::COND_NE);
+ JmpI.findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump());
}
@@ -1026,7 +1012,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
MachineInstr &SetCCI,
MachineOperand &FlagUse,
CondRegArray &CondRegs) {
- X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode());
+ X86::CondCode Cond = X86::getCondFromSETCC(SetCCI);
// Note that we can't usefully rewrite this to the inverse without complex
// analysis of the users of the setCC. Largely we rely on duplicates which
// could have been avoided already being avoided here.
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index f330acff61a1..074cf21d03f5 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -1,9 +1,8 @@
//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -60,7 +59,6 @@ namespace {
struct FPS : public MachineFunctionPass {
static char ID;
FPS() : MachineFunctionPass(ID) {
- initializeEdgeBundlesPass(*PassRegistry::getPassRegistry());
// This is really only to keep valgrind quiet.
// The logic in isLive() is too much for it.
memset(Stack, 0, sizeof(Stack));
@@ -299,9 +297,16 @@ namespace {
void setKillFlags(MachineBasicBlock &MBB) const;
};
- char FPS::ID = 0;
}
+char FPS::ID = 0;
+
+INITIALIZE_PASS_BEGIN(FPS, DEBUG_TYPE, "X86 FP Stackifier",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
+INITIALIZE_PASS_END(FPS, DEBUG_TYPE, "X86 FP Stackifier",
+ false, false)
+
FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
/// getFPReg - Return the X86::FPx register number for the specified operand.
@@ -591,7 +596,7 @@ namespace {
}
static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
- const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode);
+ const TableEntry *I = llvm::lower_bound(Table, Opcode);
if (I != Table.end() && I->from == Opcode)
return I->to;
return -1;
@@ -1096,6 +1101,8 @@ void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
// Change from the pseudo instruction to the concrete instruction.
MI.RemoveOperand(0); // Remove the explicit ST(0) operand
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+ MI.addOperand(
+ MachineOperand::CreateReg(X86::ST0, /*isDef*/ true, /*isImp*/ true));
// Result gets pushed on the stack.
pushReg(DestReg);
@@ -1140,6 +1147,8 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
// Convert from the pseudo instruction to the concrete instruction.
MI.RemoveOperand(NumOps - 1); // Remove explicit ST(0) operand
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+ MI.addOperand(
+ MachineOperand::CreateReg(X86::ST0, /*isDef*/ false, /*isImp*/ true));
if (MI.getOpcode() == X86::IST_FP64m || MI.getOpcode() == X86::ISTT_FP16m ||
MI.getOpcode() == X86::ISTT_FP32m || MI.getOpcode() == X86::ISTT_FP64m ||
@@ -1369,8 +1378,6 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
/// register arguments and no explicit destinations.
///
void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
- ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
- ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
MachineInstr &MI = *I;
unsigned NumOperands = MI.getDesc().getNumOperands();
@@ -1475,7 +1482,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
break;
}
- case TargetOpcode::INLINEASM: {
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR: {
// The inline asm MachineInstr currently only *uses* FP registers for the
// 'f' constraint. These should be turned into the current ST(x) register
// in the machine instr.
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 984db12201ed..e310fe069117 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -585,23 +584,23 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
// registers. For the prolog expansion we use RAX, RCX and RDX.
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *RegClass = &X86::GR64RegClass;
- const unsigned SizeReg = InProlog ? (unsigned)X86::RAX
+ const Register SizeReg = InProlog ? X86::RAX
: MRI.createVirtualRegister(RegClass),
- ZeroReg = InProlog ? (unsigned)X86::RCX
+ ZeroReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass),
- CopyReg = InProlog ? (unsigned)X86::RDX
+ CopyReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
- TestReg = InProlog ? (unsigned)X86::RDX
+ TestReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
- FinalReg = InProlog ? (unsigned)X86::RDX
+ FinalReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
- RoundedReg = InProlog ? (unsigned)X86::RDX
+ RoundedReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
- LimitReg = InProlog ? (unsigned)X86::RCX
+ LimitReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass),
- JoinReg = InProlog ? (unsigned)X86::RCX
+ JoinReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass),
- ProbeReg = InProlog ? (unsigned)X86::RCX
+ ProbeReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass);
// SP-relative offsets where we can save RCX and RDX.
@@ -654,9 +653,10 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
.addReg(CopyReg)
.addReg(SizeReg);
- BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+ BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg)
.addReg(TestReg)
- .addReg(ZeroReg);
+ .addReg(ZeroReg)
+ .addImm(X86::COND_B);
// FinalReg now holds final stack pointer value, or zero if
// allocation would overflow. Compare against the current stack
@@ -673,7 +673,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
.addReg(X86::GS);
BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
// Jump if the desired stack pointer is at or above the stack limit.
- BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);
+ BuildMI(&MBB, DL, TII.get(X86::JCC_1)).addMBB(ContinueMBB).addImm(X86::COND_AE);
// Add code to roundMBB to round the final stack pointer to a page boundary.
RoundMBB->addLiveIn(FinalReg);
@@ -710,7 +710,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
.addReg(RoundedReg)
.addReg(ProbeReg);
- BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB);
+ BuildMI(LoopMBB, DL, TII.get(X86::JCC_1)).addMBB(LoopMBB).addImm(X86::COND_NE);
MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
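The probe expansion in the hunks above computes the desired stack pointer with a CMOV on COND_B (zero on overflow), skips probing via a COND_AE branch when the target is still above the stack limit, rounds the target down to a page boundary, and then loops on a COND_NE branch touching one page at a time. A minimal standalone sketch of that control flow; the page size, the limit source (the TEB via GS in the real code), and the touch operation are simplified stand-ins.

#include <cassert>
#include <cstdint>
#include <functional>

constexpr uint64_t PageSize = 0x1000;

static uint64_t probeStack(uint64_t CurrentSP, uint64_t AllocSize,
                           uint64_t StackLimit,
                           const std::function<void(uint64_t)> &TouchPage) {
  assert(CurrentSP >= PageSize && "sketch assumes a sane stack pointer");
  // Mirrors the CMOV with COND_B above: final SP, or zero if it would wrap.
  uint64_t FinalSP = AllocSize > CurrentSP ? 0 : CurrentSP - AllocSize;
  // Mirrors the JCC_1/COND_AE above: nothing to probe above the limit.
  if (FinalSP >= StackLimit)
    return FinalSP;
  // Round the target down to a page boundary, then probe page by page.
  uint64_t Rounded = FinalSP & ~(PageSize - 1);
  uint64_t Probe = (CurrentSP - 1) & ~(PageSize - 1);
  while (true) {
    TouchPage(Probe); // the real loop stores to the probed page
    if (Probe == Rounded)
      break;
    Probe -= PageSize; // JCC_1/COND_NE above keeps looping until equal
  }
  return FinalSP;
}

int main() {
  unsigned Probes = 0;
  uint64_t SP = probeStack(/*CurrentSP=*/0x10000, /*AllocSize=*/0x2800,
                           /*StackLimit=*/0xF000,
                           [&](uint64_t) { ++Probes; });
  assert(SP == 0x10000 - 0x2800);
  assert(Probes == 3); // pages 0xF000, 0xE000, 0xD000 get touched
  return 0;
}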
@@ -794,8 +794,8 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
.addExternalSymbol(MF.createExternalSymbolName(Symbol));
}
- unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
- unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
+ unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX;
+ unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP;
CI.addReg(AX, RegState::Implicit)
.addReg(SP, RegState::Implicit)
.addReg(AX, RegState::Define | RegState::Implicit)
@@ -809,7 +809,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
// adjusting %rsp.
// All other platforms do not specify a particular ABI for the stack probe
// function, so we arbitrarily define it to not adjust %esp/%rsp itself.
- BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP)
+ BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP)
.addReg(SP)
.addReg(AX);
}
@@ -872,6 +872,17 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
MI->getOperand(3).setIsDead();
}
+bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
+ // x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be
+ // clobbered by any interrupt handler.
+ assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
+ "MF used frame lowering for wrong subtarget");
+ const Function &Fn = MF.getFunction();
+ const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
+ return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone);
+}
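The new has128ByteRedZone() helper folds the red-zone conditions that the prologue used to test inline. A minimal sketch of the predicate, with plain booleans standing in for the subtarget and function attribute queries.

#include <cassert>

static bool has128ByteRedZone(bool Is64Bit, bool IsWin64CC,
                              bool HasNoRedZoneAttr) {
  // 64-bit, not the Win64 calling convention, and not opted out via the
  // noredzone attribute.
  return Is64Bit && !IsWin64CC && !HasNoRedZoneAttr;
}

int main() {
  assert(has128ByteRedZone(true, false, false));   // SysV x86-64: red zone
  assert(!has128ByteRedZone(true, true, false));   // Win64: no red zone
  assert(!has128ByteRedZone(false, false, false)); // 32-bit: no red zone
  return 0;
}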
+
+
/// emitPrologue - Push callee-saved registers onto the stack, which
/// automatically adjusts the stack pointer. Adjust the stack pointer to allocate
/// space for local variables. Also emit labels used by the exception handler to
@@ -976,7 +987,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
bool HasFP = hasFP(MF);
- bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();
// FIXME: Emit FPO data for EH funclets.
@@ -1030,12 +1040,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// pointer, calls, or dynamic alloca then we do not need to adjust the
// stack pointer (we fit in the Red Zone). We also check that we don't
// push and pop from the stack.
- if (Is64Bit && !Fn.hasFnAttribute(Attribute::NoRedZone) &&
+ if (has128ByteRedZone(MF) &&
!TRI->needsStackRealignment(MF) &&
!MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
!UseStackProbe && // No stack probes.
- !IsWin64CC && // Win64 has no Red Zone
!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
@@ -1774,6 +1783,15 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
int64_t FPDelta = 0;
+ // In an x86 interrupt, remove the offset we added to account for the return
+ // address from any stack object allocated in the caller's frame. Interrupts
+ // do not have a standard return address. Fixed objects in the current frame,
+ // such as SSE register spills, should not get this treatment.
+ if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR &&
+ Offset >= 0) {
+ Offset += getOffsetOfLocalArea();
+ }
+
if (IsWin64Prologue) {
assert(!MFI.hasCalls() || (StackSize % 16) == 8);
@@ -1888,8 +1906,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
// If !hasReservedCallFrame the function might have SP adjustment in the
// body. So, even though the offset is statically known, it depends on where
// we are in the function.
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
- if (!IgnoreSPUpdates && !TFI->hasReservedCallFrame(MF))
+ if (!IgnoreSPUpdates && !hasReservedCallFrame(MF))
return getFrameIndexReference(MF, FI, FrameReg);
// We don't handle tail calls, and shouldn't be seeing them either.
@@ -2407,7 +2424,7 @@ void X86FrameLowering::adjustForSegmentedStacks(
// This jump is taken if SP >= (Stacklet Limit + Stack Space required).
// It jumps to normal execution of the function body.
- BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB);
+ BuildMI(checkMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_A);
// On 32 bit we first push the arguments size and then the frame size. On 64
// bit, we pass the stack frame size in r10 and the argument size in r11.
@@ -2637,7 +2654,7 @@ void X86FrameLowering::adjustForHiPEPrologue(
// SPLimitOffset is in a fixed heap location (pointed by BP).
addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
.addReg(ScratchReg), PReg, false, SPLimitOffset);
- BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB);
+ BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_AE);
// Create new MBB for IncStack:
BuildMI(incStackMBB, DL, TII.get(CALLop)).
@@ -2646,7 +2663,7 @@ void X86FrameLowering::adjustForHiPEPrologue(
SPReg, false, -MaxStack);
addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
.addReg(ScratchReg), PReg, false, SPLimitOffset);
- BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);
+ BuildMI(incStackMBB, DL, TII.get(X86::JCC_1)).addMBB(incStackMBB).addImm(X86::COND_LE);
stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
@@ -2802,7 +2819,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
if (StackAdjustment) {
- if (!(F.optForMinSize() &&
+ if (!(F.hasMinSize() &&
adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
/*InEpilogue=*/false);
@@ -3079,8 +3096,7 @@ void X86FrameLowering::orderFrameObjects(
// Sort the objects using X86FrameSortingAlgorithm (see its comment for
// info).
- std::stable_sort(SortingObjects.begin(), SortingObjects.end(),
- X86FrameSortingComparator());
+ llvm::stable_sort(SortingObjects, X86FrameSortingComparator());
// Now modify the original list to represent the final order that
// we want. The order will depend on whether we're going to access them
@@ -3154,7 +3170,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
int UnwindHelpFI =
- MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
+ MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false);
EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
// Store -2 into UnwindHelp on function entry. We have to scan forwards past
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 3bd805aae123..d32746e3a36e 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -1,9 +1,8 @@
//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -172,6 +171,10 @@ public:
unsigned getInitialCFARegister(const MachineFunction &MF) const override;
+ /// Return true if the function has a redzone (accessible bytes past the
+  /// frame of the top-of-stack function) as part of its ABI.
+ bool has128ByteRedZone(const MachineFunction& MF) const;
+
private:
uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
diff --git a/lib/Target/X86/X86GenRegisterBankInfo.def b/lib/Target/X86/X86GenRegisterBankInfo.def
index 9cd3f96f83ac..0fdea9071c29 100644
--- a/lib/Target/X86/X86GenRegisterBankInfo.def
+++ b/lib/Target/X86/X86GenRegisterBankInfo.def
@@ -1,9 +1,8 @@
//===- X86GenRegisterBankInfo.def ----------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5ac153244df9..95d31e62cafc 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -74,6 +73,7 @@ namespace {
int JT;
unsigned Align; // CP alignment.
unsigned char SymbolFlags; // X86II::MO_*
+ bool NegateIndex = false;
X86ISelAddressMode()
: BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
@@ -116,6 +116,8 @@ namespace {
dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
dbgs() << " Scale " << Scale << '\n'
<< "IndexReg ";
+ if (NegateIndex)
+ dbgs() << "negate ";
if (IndexReg.getNode())
IndexReg.getNode()->dump(DAG);
else
@@ -170,8 +172,8 @@ namespace {
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), OptForSize(false),
- OptForMinSize(false) {}
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
+ OptForMinSize(false), IndirectTlsSegRefs(false) {}
StringRef getPassName() const override {
return "X86 DAG->DAG Instruction Selection";
@@ -182,6 +184,13 @@ namespace {
Subtarget = &MF.getSubtarget<X86Subtarget>();
IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
"indirect-tls-seg-refs");
+
+ // OptFor[Min]Size are used in pattern predicates that isel is matching.
+ OptForSize = MF.getFunction().hasOptSize();
+ OptForMinSize = MF.getFunction().hasMinSize();
+ assert((!OptForMinSize || OptForSize) &&
+ "OptForMinSize implies OptForSize");
+
SelectionDAGISel::runOnMachineFunction(MF);
return true;
}
@@ -204,7 +213,7 @@ namespace {
bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
bool matchAddress(SDValue N, X86ISelAddressMode &AM);
bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
- bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
+ bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth);
bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
@@ -252,16 +261,32 @@ namespace {
void emitSpecialCodeForMain();
inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
- SDValue &Base, SDValue &Scale,
+ MVT VT, SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
- Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
- ? CurDAG->getTargetFrameIndex(
- AM.Base_FrameIndex,
- TLI->getPointerTy(CurDAG->getDataLayout()))
- : AM.Base_Reg;
+ if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ Base = CurDAG->getTargetFrameIndex(
+ AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
+ else if (AM.Base_Reg.getNode())
+ Base = AM.Base_Reg;
+ else
+ Base = CurDAG->getRegister(0, VT);
+
Scale = getI8Imm(AM.Scale, DL);
- Index = AM.IndexReg;
+
+ // Negate the index if needed.
+ if (AM.NegateIndex) {
+ unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
+ SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
+ AM.IndexReg), 0);
+ AM.IndexReg = Neg;
+ }
+
+ if (AM.IndexReg.getNode())
+ Index = AM.IndexReg;
+ else
+ Index = CurDAG->getRegister(0, VT);
+
// These are 32-bit even in 64-bit mode since RIP-relative offset
// is 32-bit.
if (AM.GV)
@@ -290,7 +315,7 @@ namespace {
if (AM.Segment.getNode())
Segment = AM.Segment;
else
- Segment = CurDAG->getRegister(0, MVT::i32);
+ Segment = CurDAG->getRegister(0, MVT::i16);
}
// Utility function to determine whether we should avoid selecting
@@ -400,6 +425,19 @@ namespace {
return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
}
+ // Helper to detect unneeded and instructions on shift amounts. Called
+ // from PatFrags in tablegen.
+ bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
+ const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+
+ if (Val.countTrailingOnes() >= Width)
+ return true;
+
+ APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
+ return Mask.countTrailingOnes() >= Width;
+ }
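isUnneededShiftMask() treats an AND on a shift amount as redundant when the mask keeps at least Width low bits, either directly or after folding in bits already known to be zero. A standalone sketch of the same check, using plain 64-bit integers in place of APInt and computeKnownBits.

#include <cassert>
#include <cstdint>

static unsigned countTrailingOnes(uint64_t V) {
  unsigned N = 0;
  while (V & 1) { V >>= 1; ++N; }
  return N;
}

static bool isUnneededShiftMask(uint64_t MaskVal, uint64_t KnownZeroOfAmount,
                                unsigned Width) {
  if (countTrailingOnes(MaskVal) >= Width)
    return true;
  // Bits the AND clears but that are known zero anyway cannot change the
  // shift amount, so treat them as if the mask kept them.
  return countTrailingOnes(MaskVal | KnownZeroOfAmount) >= Width;
}

int main() {
  // A 32-bit shift only reads the low 5 bits of its amount, so (amt & 31)
  // is unneeded for Width == 5.
  assert(isUnneededShiftMask(/*Mask=*/31, /*KnownZero=*/0, /*Width=*/5));
  // (amt & 15) is not enough on its own...
  assert(!isUnneededShiftMask(15, 0, 5));
  // ...unless bit 4 of the amount is already known to be zero.
  assert(isUnneededShiftMask(15, 1u << 4, 5));
  return 0;
}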
+
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
@@ -464,6 +502,8 @@ namespace {
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
bool tryShiftAmountMod(SDNode *N);
+ bool tryShrinkShlLogicImm(SDNode *N);
+ bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node);
@@ -485,7 +525,7 @@ namespace {
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
- Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
+ Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
@@ -497,7 +537,7 @@ static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
}
// Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
- Opcode == X86ISD::FSETCCM_RND)
+ Opcode == X86ISD::FSETCCM_SAE)
return true;
return false;
@@ -571,6 +611,21 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
Imm->getAPIntValue().getBitWidth() == 64 &&
Imm->getAPIntValue().isIntN(32))
return false;
+
+ // If this really a zext_inreg that can be represented with a movzx
+ // instruction, prefer that.
+ // TODO: We could shrink the load and fold if it is non-volatile.
+ if (U->getOpcode() == ISD::AND &&
+ (Imm->getAPIntValue() == UINT8_MAX ||
+ Imm->getAPIntValue() == UINT16_MAX ||
+ Imm->getAPIntValue() == UINT32_MAX))
+ return false;
+
+  // ADD/SUB can negate the immediate and use the opposite operation
+ // to fit 128 into a sign extended 8 bit immediate.
+ if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
+ (-Imm->getAPIntValue()).isSignedIntN(8))
+ return false;
}
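The two added checks keep a 64-bit immediate out of the instruction when a better encoding exists: an AND with 0xFF/0xFFFF/0xFFFFFFFF is really a zero-extension, and an ADD/SUB immediate whose negation fits a signed 8-bit value (the 128 case) is cheaper as the opposite operation. A small sketch of both heuristics, with plain integers standing in for the APInt values.

#include <cassert>
#include <cstdint>

static bool preferMovzxOverAndImm(uint64_t Imm) {
  // and reg, 0xff / 0xffff / 0xffffffff is a zero-extension and is better
  // emitted as movzx/mov, so keep the immediate out of the AND.
  return Imm == UINT8_MAX || Imm == UINT16_MAX || Imm == UINT32_MAX;
}

static bool fitsAsNegatedImm8(int64_t Imm) {
  // add reg, 128 needs a 32-bit immediate, but sub reg, -128 fits the
  // sign-extended 8-bit form, so prefer the negated encoding.
  int64_t Neg = -Imm;
  return Neg >= -128 && Neg <= 127;
}

int main() {
  assert(preferMovzxOverAndImm(0xFF) && preferMovzxOverAndImm(0xFFFFFFFFu));
  assert(!preferMovzxOverAndImm(0x7F));
  assert(fitsAsNegatedImm8(128));  // the motivating case from the comment
  assert(!fitsAsNegatedImm8(129)); // -129 does not fit in 8 bits
  return 0;
}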
// If the other operand is a TLS address, we should fold it instead.
@@ -720,11 +775,6 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
}
void X86DAGToDAGISel::PreprocessISelDAG() {
- // OptFor[Min]Size are used in pattern predicates that isel is matching.
- OptForSize = MF->getFunction().optForSize();
- OptForMinSize = MF->getFunction().optForMinSize();
- assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
-
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
@@ -741,6 +791,143 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
continue;
}
+ switch (N->getOpcode()) {
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ // Replace vector fp_to_s/uint with their X86 specific equivalent so we
+ // don't need 2 sets of patterns.
+ if (!N->getSimpleValueType(0).isVector())
+ break;
+
+ unsigned NewOpc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
+ case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
+ }
+ SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(0));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: {
+ // Replace vector shifts with their X86 specific equivalent so we don't
+ // need 2 sets of patterns.
+ if (!N->getValueType(0).isVector())
+ break;
+
+ unsigned NewOpc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
+ case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
+ case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
+ }
+ SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND_VECTOR_INREG: {
+ // Replace vector any extend with the zero extend equivalents so we don't
+ // need 2 sets of patterns. Ignore vXi1 extensions.
+ if (!N->getValueType(0).isVector() ||
+ N->getOperand(0).getScalarValueSizeInBits() == 1)
+ break;
+
+ unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
+ ? ISD::ZERO_EXTEND
+ : ISD::ZERO_EXTEND_VECTOR_INREG;
+
+ SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(0));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FTRUNC:
+ case ISD::FNEARBYINT:
+ case ISD::FRINT: {
+ // Replace fp rounding with their X86 specific equivalent so we don't
+ // need 2 sets of patterns.
+ unsigned Imm;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::FCEIL: Imm = 0xA; break;
+ case ISD::FFLOOR: Imm = 0x9; break;
+ case ISD::FTRUNC: Imm = 0xB; break;
+ case ISD::FNEARBYINT: Imm = 0xC; break;
+ case ISD::FRINT: Imm = 0x4; break;
+ }
+ SDLoc dl(N);
+ SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
+ N->getValueType(0),
+ N->getOperand(0),
+ CurDAG->getConstant(Imm, dl, MVT::i8));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
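The case above canonicalizes the generic FP rounding nodes to X86ISD::VRNDSCALE with a rounding-control immediate. A tiny standalone lookup using the same immediates; the enum is a stand-in for the ISD opcodes.

#include <cassert>
#include <cstdint>

enum class RoundOp { Ceil, Floor, Trunc, NearbyInt, Rint };

static uint8_t vrndscaleImm(RoundOp Op) {
  switch (Op) {
  case RoundOp::Ceil:      return 0xA; // round toward +inf, suppress exceptions
  case RoundOp::Floor:     return 0x9; // round toward -inf, suppress exceptions
  case RoundOp::Trunc:     return 0xB; // round toward zero, suppress exceptions
  case RoundOp::NearbyInt: return 0xC; // current rounding mode, no inexact
  case RoundOp::Rint:      return 0x4; // current rounding mode, may raise inexact
  }
  return 0;
}

int main() {
  assert(vrndscaleImm(RoundOp::Floor) == 0x9);
  assert(vrndscaleImm(RoundOp::Trunc) == 0xB);
  return 0;
}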
+ case X86ISD::FANDN:
+ case X86ISD::FAND:
+ case X86ISD::FOR:
+ case X86ISD::FXOR: {
+ // Widen scalar fp logic ops to vector to reduce isel patterns.
+      // FIXME: Can we do this during lowering/combine?
+ MVT VT = N->getSimpleValueType(0);
+ if (VT.isVector() || VT == MVT::f128)
+ break;
+
+ MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
+ SDLoc dl(N);
+ SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
+ N->getOperand(0));
+ SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
+ N->getOperand(1));
+
+ SDValue Res;
+ if (Subtarget->hasSSE2()) {
+ EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
+ Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
+ Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
+ unsigned Opc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
+ case X86ISD::FAND: Opc = ISD::AND; break;
+ case X86ISD::FOR: Opc = ISD::OR; break;
+ case X86ISD::FXOR: Opc = ISD::XOR; break;
+ }
+ Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
+ Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
+ } else {
+ Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
+ }
+ Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
+ CurDAG->getIntPtrConstant(0, dl));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+ }
+
if (OptLevel != CodeGenOpt::None &&
// Only do this when the target can fold the load into the call or
// jmp.
@@ -786,65 +973,135 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// and the node legalization. As such this pass basically does "really
// late" legalization of these inline with the X86 isel pass.
// FIXME: This should only happen when not compiled with -O0.
- if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
- continue;
+ switch (N->getOpcode()) {
+ default: continue;
+ case ISD::FP_ROUND:
+ case ISD::FP_EXTEND:
+ {
+ MVT SrcVT = N->getOperand(0).getSimpleValueType();
+ MVT DstVT = N->getSimpleValueType(0);
+
+ // If any of the sources are vectors, no fp stack involved.
+ if (SrcVT.isVector() || DstVT.isVector())
+ continue;
- MVT SrcVT = N->getOperand(0).getSimpleValueType();
- MVT DstVT = N->getSimpleValueType(0);
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(TLI);
+ bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
+ continue;
- // If any of the sources are vectors, no fp stack involved.
- if (SrcVT.isVector() || DstVT.isVector())
- continue;
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(1))
+ continue;
+ }
- // If the source and destination are SSE registers, then this is a legal
- // conversion that should not be lowered.
- const X86TargetLowering *X86Lowering =
- static_cast<const X86TargetLowering *>(TLI);
- bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
- bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
- if (SrcIsSSE && DstIsSSE)
- continue;
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ MVT MemVT;
+ if (N->getOpcode() == ISD::FP_ROUND)
+ MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+ else
+ MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ SDLoc dl(N);
- if (!SrcIsSSE && !DstIsSSE) {
- // If this is an FPStack extension, it is a noop.
- if (N->getOpcode() == ISD::FP_EXTEND)
+ // FIXME: optimize the case where the src/dest is a load or store?
+
+ SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
+ MemTmp, MachinePointerInfo(), MemVT);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+ MachinePointerInfo(), MemVT);
+
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+      // extload we created. This will cause general havoc on the dag because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+ break;
+ }
+
+  // The sequence of events for lowering STRICT_FP versions of these nodes requires
+  // dealing with the chain differently, as there is already a preexisting chain.
+ case ISD::STRICT_FP_ROUND:
+ case ISD::STRICT_FP_EXTEND:
+ {
+ MVT SrcVT = N->getOperand(1).getSimpleValueType();
+ MVT DstVT = N->getSimpleValueType(0);
+
+ // If any of the sources are vectors, no fp stack involved.
+ if (SrcVT.isVector() || DstVT.isVector())
continue;
- // If this is a value-preserving FPStack truncation, it is a noop.
- if (N->getConstantOperandVal(1))
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(TLI);
+ bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
continue;
- }
- // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
- // FPStack has extload and truncstore. SSE can fold direct loads into other
- // operations. Based on this, decide what we want to do.
- MVT MemVT;
- if (N->getOpcode() == ISD::FP_ROUND)
- MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
- else
- MemVT = SrcIsSSE ? SrcVT : DstVT;
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(2))
+ continue;
+ }
+
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ MVT MemVT;
+ if (N->getOpcode() == ISD::STRICT_FP_ROUND)
+ MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+ else
+ MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ SDLoc dl(N);
+
+ // FIXME: optimize the case where the src/dest is a load or store?
- SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
- SDLoc dl(N);
+      // Since the operation is StrictFP, use the preexisting chain.
+ SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
+ MemTmp, MachinePointerInfo(), MemVT);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+ MachinePointerInfo(), MemVT);
- // FIXME: optimize the case where the src/dest is a load or store?
- SDValue Store =
- CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
- MemTmp, MachinePointerInfo(), MemVT);
- SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
- MachinePointerInfo(), MemVT);
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+      // extload we created. This will cause general havoc on the dag because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Result.getNode());
+ break;
+ }
+ }
- // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
- // extload we created. This will cause general havok on the dag because
- // anything below the conversion could be folded into other existing nodes.
- // To avoid invalidating 'I', back it up to the convert node.
- --I;
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
// Now that we did that, the node is dead. Increment the iterator to the
// next node to process, then delete N.
++I;
CurDAG->DeleteNode(N);
}
+
+ // The load+call transform above can leave some dead nodes in the graph. Make
+  // sure we remove them. It's possible some of the other transforms do too, so
+ // just remove dead nodes unconditionally.
+ CurDAG->RemoveDeadNodes();
}
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
@@ -1138,15 +1395,23 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
if (AM.hasSymbolicDisplacement())
return true;
+ bool IsRIPRelTLS = false;
bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
+ if (IsRIPRel) {
+ SDValue Val = N.getOperand(0);
+ if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+ IsRIPRelTLS = true;
+ }
- // We can't use an addressing mode in the 64-bit large code model. In the
- // medium code model, we use can use an mode when RIP wrappers are present.
- // That signifies access to globals that are known to be "near", such as the
- // GOT itself.
+ // We can't use an addressing mode in the 64-bit large code model.
+ // Global TLS addressing is an exception. In the medium code model,
+  // we can use a mode when RIP wrappers are present.
+ // That signifies access to globals that are known to be "near",
+ // such as the GOT itself.
CodeModel::Model M = TM.getCodeModel();
if (Subtarget->is64Bit() &&
- (M == CodeModel::Large || (M == CodeModel::Medium && !IsRIPRel)))
+ ((M == CodeModel::Large && !IsRIPRelTLS) ||
+ (M == CodeModel::Medium && !IsRIPRel)))
return true;
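The updated gate lets RIP-relative TLS accesses fold a symbolic displacement even under the large code model, while the medium code model still requires a RIP wrapper. A standalone sketch of the decision, with booleans standing in for the subtarget and node queries.

#include <cassert>

enum class CodeModel { Small, Kernel, Medium, Large };

static bool cannotFoldSymbolicDisp(bool Is64Bit, CodeModel M, bool IsRIPRel,
                                   bool IsRIPRelTLS) {
  if (!Is64Bit)
    return false;
  if (M == CodeModel::Large && !IsRIPRelTLS)
    return true; // large model: only RIP-relative TLS may fold
  if (M == CodeModel::Medium && !IsRIPRel)
    return true; // medium model: only RIP wrappers ("near" globals) may fold
  return false;
}

int main() {
  assert(cannotFoldSymbolicDisp(true, CodeModel::Large, true, false));
  assert(!cannotFoldSymbolicDisp(true, CodeModel::Large, true, true));
  assert(!cannotFoldSymbolicDisp(true, CodeModel::Medium, true, false));
  assert(cannotFoldSymbolicDisp(true, CodeModel::Medium, false, false));
  return 0;
}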
// Base and index reg must be 0 in order to use %rip as base.
@@ -1212,20 +1477,25 @@ bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
// Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
// because it has a smaller encoding.
// TODO: Which other code models can use this?
- if (TM.getCodeModel() == CodeModel::Small &&
- Subtarget->is64Bit() &&
- AM.Scale == 1 &&
- AM.BaseType == X86ISelAddressMode::RegBase &&
- AM.Base_Reg.getNode() == nullptr &&
- AM.IndexReg.getNode() == nullptr &&
- AM.SymbolFlags == X86II::MO_NO_FLAG &&
- AM.hasSymbolicDisplacement())
- AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+ switch (TM.getCodeModel()) {
+ default: break;
+ case CodeModel::Small:
+ case CodeModel::Kernel:
+ if (Subtarget->is64Bit() &&
+ AM.Scale == 1 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr &&
+ AM.IndexReg.getNode() == nullptr &&
+ AM.SymbolFlags == X86II::MO_NO_FLAG &&
+ AM.hasSymbolicDisplacement())
+ AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+ break;
+ }
return false;
}
-bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
+bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
unsigned Depth) {
// Add an artificial use to this node so that we can keep track of
// it if it gets CSE'd with a different node.
@@ -1317,6 +1587,7 @@ static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
insertDAGNode(DAG, N, ShlCount);
insertDAGNode(DAG, N, Shl);
DAG.ReplaceAllUsesWith(N, Shl);
+ DAG.RemoveDeadNode(N.getNode());
AM.IndexReg = And;
AM.Scale = (1 << ScaleLog);
return false;
@@ -1326,13 +1597,31 @@ static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
- uint64_t Mask,
- SDValue Shift, SDValue X,
X86ISelAddressMode &AM) {
+ SDValue Shift = N.getOperand(0);
+
+ // Use a signed mask so that shifting right will insert sign bits. These
+ // bits will be removed when we shift the result left so it doesn't matter
+ // what we use. This might allow a smaller immediate encoding.
+ int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
+
+ // If we have an any_extend feeding the AND, look through it to see if there
+ // is a shift behind it. But only if the AND doesn't use the extended bits.
+  // FIXME: Generalize this to ANY_EXTENDs other than i32 to i64?
+ bool FoundAnyExtend = false;
+ if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
+ Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
+ isUInt<32>(Mask)) {
+ FoundAnyExtend = true;
+ Shift = Shift.getOperand(0);
+ }
+
if (Shift.getOpcode() != ISD::SHL ||
!isa<ConstantSDNode>(Shift.getOperand(1)))
return true;
+ SDValue X = Shift.getOperand(0);
+
// Not likely to be profitable if either the AND or SHIFT node has more
// than one use (unless all uses are for address computation). Besides,
// isel mechanism requires their node ids to be reused.
@@ -1346,6 +1635,12 @@ static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
MVT VT = N.getSimpleValueType();
SDLoc DL(N);
+ if (FoundAnyExtend) {
+ SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
+ insertDAGNode(DAG, N, NewX);
+ X = NewX;
+ }
+
SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
@@ -1359,6 +1654,7 @@ static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
insertDAGNode(DAG, N, NewAnd);
insertDAGNode(DAG, N, NewShift);
DAG.ReplaceAllUsesWith(N, NewShift);
+ DAG.RemoveDeadNode(N.getNode());
AM.Scale = 1 << ShiftAmt;
AM.IndexReg = NewAnd;
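The fold above rewrites (and (shl X, C), Mask) as (shl (and X, Mask >> C), C) so the shift can become the addressing-mode scale; the signed shift of the mask mirrors the getSExtValue() use in the code. A quick standalone check of that identity.

#include <cassert>
#include <cstdint>

static uint64_t before(uint64_t X, unsigned C, int64_t Mask) {
  return (X << C) & static_cast<uint64_t>(Mask);
}

static uint64_t after(uint64_t X, unsigned C, int64_t Mask) {
  uint64_t NewMask = static_cast<uint64_t>(Mask >> C); // arithmetic shift
  return (X & NewMask) << C;
}

int main() {
  const uint64_t Xs[] = {0, 1, 0x1234, 0xFFFFFFFFFFFFFFFFull};
  const int64_t Masks[] = {0xFF0, -16, 0x7FF8}; // -16 models a "signed" mask
  for (uint64_t X : Xs)
    for (int64_t M : Masks)
      for (unsigned C = 1; C <= 3; ++C) // scales 2, 4, 8
        assert(before(X, C, M) == after(X, C, M));
  return 0;
}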
@@ -1469,6 +1765,7 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
insertDAGNode(DAG, N, NewSHLAmt);
insertDAGNode(DAG, N, NewSHL);
DAG.ReplaceAllUsesWith(N, NewSHL);
+ DAG.RemoveDeadNode(N.getNode());
AM.Scale = 1 << AMShiftAmt;
AM.IndexReg = NewSRL;
@@ -1527,6 +1824,7 @@ static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
insertDAGNode(DAG, N, NewSHLAmt);
insertDAGNode(DAG, N, NewSHL);
DAG.ReplaceAllUsesWith(N, NewSHL);
+ DAG.RemoveDeadNode(N.getNode());
AM.Scale = 1 << AMShiftAmt;
AM.IndexReg = NewAnd;
@@ -1634,14 +1932,15 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Scale must not be used already.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
+ "Unexpected value size!");
+
SDValue And = N.getOperand(0);
if (And.getOpcode() != ISD::AND) break;
SDValue X = And.getOperand(0);
- // We only handle up to 64-bit values here as those are what matter for
- // addressing mode optimizations.
- if (X.getSimpleValueType().getSizeInBits() > 64) break;
-
// The mask used for the transform is expected to be post-shift, but we
// found the shift first so just apply the shift to the mask before passing
// it down.
@@ -1712,9 +2011,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Test if the LHS of the sub can be folded.
X86ISelAddressMode Backup = AM;
if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
+ N = Handle.getValue();
AM = Backup;
break;
}
+ N = Handle.getValue();
// Test if the index field is free for use.
if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
AM = Backup;
@@ -1722,7 +2023,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
}
int Cost = 0;
- SDValue RHS = Handle.getValue().getOperand(1);
+ SDValue RHS = N.getOperand(1);
// If the RHS involves a register with multiple uses, this
// transformation incurs an extra mov, due to the neg instruction
// clobbering its operand.
@@ -1735,9 +2036,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
++Cost;
// If the base is a register with multiple uses, this
// transformation may save a mov.
- // FIXME: Don't rely on DELETED_NODEs.
if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
- AM.Base_Reg->getOpcode() != ISD::DELETED_NODE &&
!AM.Base_Reg.getNode()->hasOneUse()) ||
AM.BaseType == X86ISelAddressMode::FrameIndexBase)
--Cost;
@@ -1754,14 +2053,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
}
// Ok, the transformation is legal and appears profitable. Go for it.
- SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType());
- SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
- AM.IndexReg = Neg;
+ // Negation will be emitted later to avoid creating dangling nodes if this
+ // was an unprofitable LEA.
+ AM.IndexReg = RHS;
+ AM.NegateIndex = true;
AM.Scale = 1;
-
- // Insert the new nodes into the topological ordering.
- insertDAGNode(*CurDAG, Handle.getValue(), Zero);
- insertDAGNode(*CurDAG, Handle.getValue(), Neg);
return false;
}
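Instead of materializing 0 - RHS eagerly, the sub case now records the operand plus the new NegateIndex flag, and the NEG is emitted only if the LEA turns out to be profitable. A tiny model of the resulting address computation; the struct is a stand-in for X86ISelAddressMode.

#include <cassert>
#include <cstdint>

struct AddressMode {
  uint64_t Base = 0;
  uint64_t Index = 0;
  unsigned Scale = 1;
  int64_t Disp = 0;
  bool NegateIndex = false; // new flag carried in X86ISelAddressMode
};

static uint64_t materialize(const AddressMode &AM) {
  uint64_t Index = AM.NegateIndex ? 0 - AM.Index : AM.Index; // the late NEG
  return AM.Base + Index * AM.Scale + static_cast<uint64_t>(AM.Disp);
}

int main() {
  // base - rhs, matched from (sub base, rhs) as base + (-rhs) * 1.
  AddressMode AM;
  AM.Base = 0x1000;
  AM.Index = 0x10;
  AM.NegateIndex = true;
  assert(materialize(AM) == 0x1000 - 0x10);
  return 0;
}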
@@ -1789,37 +2085,77 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Scale must not be used already.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
- SDValue Shift = N.getOperand(0);
- if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
- SDValue X = Shift.getOperand(0);
-
// We only handle up to 64-bit values here as those are what matter for
// addressing mode optimizations.
- if (X.getSimpleValueType().getSizeInBits() > 64) break;
+ assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
+ "Unexpected value size!");
if (!isa<ConstantSDNode>(N.getOperand(1)))
break;
- uint64_t Mask = N.getConstantOperandVal(1);
- // Try to fold the mask and shift into an extract and scale.
- if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
- return false;
+ if (N.getOperand(0).getOpcode() == ISD::SRL) {
+ SDValue Shift = N.getOperand(0);
+ SDValue X = Shift.getOperand(0);
- // Try to fold the mask and shift directly into the scale.
- if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
- return false;
+ uint64_t Mask = N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into an extract and scale.
+ if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to fold the mask and shift directly into the scale.
+ if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to fold the mask and shift into BEXTR and scale.
+ if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
+ return false;
+ }
// Try to swap the mask and shift to place shifts which can be done as
// a scale on the outside of the mask.
- if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
- return false;
-
- // Try to fold the mask and shift into BEXTR and scale.
- if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
+ if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
return false;
break;
}
+ case ISD::ZERO_EXTEND: {
+ // Try to widen a zexted shift left to the same size as its use, so we can
+ // match the shift as a scale factor.
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
+ break;
+ if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
+ break;
+
+ // Give up if the shift is not a valid scale factor [1,2,3].
+ SDValue Shl = N.getOperand(0);
+ auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
+ if (!ShAmtC || ShAmtC->getZExtValue() > 3)
+ break;
+
+ // The narrow shift must only shift out zero bits (it must be 'nuw').
+ // That makes it safe to widen to the destination type.
+ APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
+ ShAmtC->getZExtValue());
+ if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
+ break;
+
+ // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
+ MVT VT = N.getSimpleValueType();
+ SDLoc DL(N);
+ SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
+ SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
+
+ // Convert the shift to scale factor.
+ AM.Scale = 1 << ShAmtC->getZExtValue();
+ AM.IndexReg = Zext;
+
+ insertDAGNode(*CurDAG, N, Zext);
+ insertDAGNode(*CurDAG, N, NewShl);
+ CurDAG->ReplaceAllUsesWith(N, NewShl);
+ CurDAG->RemoveDeadNode(N.getNode());
+ return false;
+ }
}
return matchAddressBase(N, AM);
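The new ZERO_EXTEND case widens zext(shl nuw X, C) to shl(zext X, C) so the shift can become a scale factor, guarded by the MaskedValueIsZero check that no set bits are shifted out of the narrow type. A standalone check of that guard and of the equivalence it protects, using an 8-bit to 32-bit widening as a stand-in for the DAG types.

#include <cassert>
#include <cstdint>

static bool narrowShiftKeepsAllBits(uint8_t X, unsigned C) {
  // High C bits of X must already be zero, mirroring the HighZeros check.
  uint8_t HighMask = static_cast<uint8_t>(0xFF << (8 - C));
  return (X & HighMask) == 0;
}

int main() {
  for (unsigned C = 1; C <= 3; ++C)         // valid scale factors 2, 4, 8
    for (unsigned V = 0; V <= 0xFF; ++V) {
      uint8_t X = static_cast<uint8_t>(V);
      if (!narrowShiftKeepsAllBits(X, C))
        continue;
      uint32_t Narrow = static_cast<uint32_t>(static_cast<uint8_t>(X << C));
      uint32_t Wide = static_cast<uint32_t>(X) << C; // becomes Scale = 1 << C
      assert(Narrow == Wide); // widening is safe exactly when no bits are lost
    }
  return 0;
}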
@@ -1885,17 +2221,14 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
if (AddrSpace == 258)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+
// Try to match into the base and displacement fields.
if (matchVectorAddress(N, AM))
return false;
- MVT VT = N.getSimpleValueType();
- if (AM.BaseType == X86ISelAddressMode::RegBase) {
- if (!AM.Base_Reg.getNode())
- AM.Base_Reg = CurDAG->getRegister(0, VT);
- }
-
- getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -1917,6 +2250,8 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+ Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
+ Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
unsigned AddrSpace =
@@ -1930,19 +2265,14 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
}
- if (matchAddress(N, AM))
- return false;
-
+ // Save the DL and VT before calling matchAddress, it can invalidate N.
+ SDLoc DL(N);
MVT VT = N.getSimpleValueType();
- if (AM.BaseType == X86ISelAddressMode::RegBase) {
- if (!AM.Base_Reg.getNode())
- AM.Base_Reg = CurDAG->getRegister(0, VT);
- }
- if (!AM.IndexReg.getNode())
- AM.IndexReg = CurDAG->getRegister(0, VT);
+ if (matchAddress(N, AM))
+ return false;
- getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -1974,12 +2304,14 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
if (!hasSingleUsesFromRoot(Root, Parent))
return false;
- // We can allow a full vector load here since narrowing a load is ok.
+ // We can allow a full vector load here since narrowing a load is ok unless
+ // it's volatile.
if (ISD::isNON_EXTLoad(N.getNode())) {
- PatternNodeWithChain = N;
- if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
- LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (!LD->isVolatile() &&
+ IsProfitableToFold(N, LD, Root) &&
+ IsLegalToFold(N, Parent, Root, OptLevel)) {
+ PatternNodeWithChain = N;
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
}
@@ -2010,23 +2342,6 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
}
}
- // Also handle the case where we explicitly require zeros in the top
- // elements. This is a vector shuffle from the zero vector.
- if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
- // Check to see if the top elements are all zeros (or bitcast of zeros).
- N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
- N.getOperand(0).getNode()->hasOneUse()) {
- PatternNodeWithChain = N.getOperand(0).getOperand(0);
- if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
- IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
- // Okay, this is a zero extending load. Fold it.
- LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
- return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
- Segment);
- }
- }
-
return false;
}
@@ -2077,14 +2392,12 @@ bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
if (RN && RN->getReg() == 0)
Base = CurDAG->getRegister(0, MVT::i64);
- else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) {
+ else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
// Base could already be %rip, particularly in the x32 ABI.
- Base = SDValue(CurDAG->getMachineNode(
- TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
- CurDAG->getTargetConstant(0, DL, MVT::i64),
- Base,
- CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
- 0);
+ SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
+ MVT::i64), 0);
+ Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
+ Base);
}
RN = dyn_cast<RegisterSDNode>(Index);
@@ -2093,13 +2406,10 @@ bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
else {
assert(Index.getValueType() == MVT::i32 &&
"Expect to be extending 32-bit registers for use in LEA");
- Index = SDValue(CurDAG->getMachineNode(
- TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
- CurDAG->getTargetConstant(0, DL, MVT::i64),
- Index,
- CurDAG->getTargetConstant(X86::sub_32bit, DL,
- MVT::i32)),
- 0);
+ SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
+ MVT::i64), 0);
+ Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
+ Index);
}
return true;
@@ -2128,18 +2438,13 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
AM.Segment = Copy;
unsigned Complexity = 0;
- if (AM.BaseType == X86ISelAddressMode::RegBase)
- if (AM.Base_Reg.getNode())
- Complexity = 1;
- else
- AM.Base_Reg = CurDAG->getRegister(0, VT);
+ if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
+ Complexity = 1;
else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
Complexity = 4;
if (AM.IndexReg.getNode())
Complexity++;
- else
- AM.IndexReg = CurDAG->getRegister(0, VT);
// Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
// a simple shift.
@@ -2159,14 +2464,14 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
Complexity += 2;
}
- if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
+ if (AM.Disp)
Complexity++;
// If it isn't worth using an LEA, reject it.
if (Complexity <= 2)
return false;
- getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
+ getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -2180,17 +2485,15 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
X86ISelAddressMode AM;
AM.GV = GA->getGlobal();
AM.Disp += GA->getOffset();
- AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
AM.SymbolFlags = GA->getTargetFlags();
- if (N.getValueType() == MVT::i32) {
+ MVT VT = N.getSimpleValueType();
+ if (VT == MVT::i32) {
AM.Scale = 1;
AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
- } else {
- AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
}
- getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -2274,14 +2577,22 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
CR->getSignedMax().slt(1ull << Width);
}
-static X86::CondCode getCondFromOpc(unsigned Opc) {
+static X86::CondCode getCondFromNode(SDNode *N) {
+ assert(N->isMachineOpcode() && "Unexpected node");
X86::CondCode CC = X86::COND_INVALID;
- if (CC == X86::COND_INVALID)
- CC = X86::getCondFromBranchOpc(Opc);
- if (CC == X86::COND_INVALID)
- CC = X86::getCondFromSETOpc(Opc);
- if (CC == X86::COND_INVALID)
- CC = X86::getCondFromCMovOpc(Opc);
+ unsigned Opc = N->getMachineOpcode();
+ if (Opc == X86::JCC_1)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
+ else if (Opc == X86::SETCCr)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
+ else if (Opc == X86::SETCCm)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
+ else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
+ Opc == X86::CMOV64rr)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
+ else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
+ Opc == X86::CMOV64rm)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
return CC;
}
@@ -2307,7 +2618,7 @@ bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
// Anything unusual: assume conservatively.
if (!FlagUI->isMachineOpcode()) return false;
// Examine the condition code of the user.
- X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
+ X86::CondCode CC = getCondFromNode(*FlagUI);
switch (CC) {
// Comparisons which only use the zero flag.
@@ -2343,7 +2654,7 @@ bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
// Anything unusual: assume conservatively.
if (!FlagUI->isMachineOpcode()) return false;
// Examine the condition code of the user.
- X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
+ X86::CondCode CC = getCondFromNode(*FlagUI);
switch (CC) {
// Comparisons which don't examine the SF flag.
@@ -2404,7 +2715,7 @@ static bool mayUseCarryFlag(X86::CondCode CC) {
if (!FlagUI->isMachineOpcode())
return false;
// Examine the condition code of the user.
- X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
+ X86::CondCode CC = getCondFromNode(*FlagUI);
if (mayUseCarryFlag(CC))
return false;
@@ -2582,10 +2893,13 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
return false;
bool IsCommutable = false;
+ bool IsNegate = false;
switch (Opc) {
default:
return false;
case X86ISD::SUB:
+ IsNegate = isNullConstant(StoredVal.getOperand(0));
+ break;
case X86ISD::SBB:
break;
case X86ISD::ADD:
@@ -2597,7 +2911,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
break;
}
- unsigned LoadOpNo = 0;
+ unsigned LoadOpNo = IsNegate ? 1 : 0;
LoadSDNode *LoadNode = nullptr;
SDValue InputChain;
if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
@@ -2635,11 +2949,20 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
MachineSDNode *Result;
switch (Opc) {
- case X86ISD::ADD:
case X86ISD::SUB:
+ // Handle negate.
+ if (IsNegate) {
+ unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
+ X86::NEG8m);
+ const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
+ MVT::Other, Ops);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::ADD:
// Try to match inc/dec.
- if (!Subtarget->slowIncDec() ||
- CurDAG->getMachineFunction().getFunction().optForSize()) {
+ if (!Subtarget->slowIncDec() || OptForSize) {
bool IsOne = isOneConstant(StoredVal.getOperand(1));
bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
// ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
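The two special cases in this hunk, the negate fold and the inc/dec fold, rest on simple identities about the stored value; a tiny standalone sketch in plain C++ (not LLVM code) of what the selected memory forms compute:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Mem = 42;

  // X86ISD::SUB with a zero LHS, i.e. *p = 0 - *p, is just a negate and can
  // be selected as a single NEG32m on the memory operand.
  uint32_t Neg = 0u - Mem;
  assert(Neg == 0xFFFFFFD6u); // two's complement of 42

  // ADD/SUB of 1/-1 where the carry flag is unused can become INC/DEC forms.
  assert(Mem + 1 == 43 && Mem - 1 == 41);
  return 0;
}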
@@ -2740,16 +3063,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
// See if the operand is a constant that we can fold into an immediate
// operand.
if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
- auto OperandV = OperandC->getAPIntValue();
+ int64_t OperandV = OperandC->getSExtValue();
// Check if we can shrink the operand enough to fit in an immediate (or
// fit into a smaller immediate) by negating it and switching the
// operation.
if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
- ((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 &&
- (-OperandV).getMinSignedBits() <= 8) ||
- (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 &&
- (-OperandV).getMinSignedBits() <= 32)) &&
+ ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
+ (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
+ isInt<32>(-OperandV))) &&
hasNoCarryFlagUses(StoredVal.getValue(1))) {
OperandV = -OperandV;
Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
@@ -2757,11 +3079,10 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
// First try to fit this into an Imm8 operand. If it doesn't fit, then try
// the larger immediate operand.
- if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) {
+ if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
NewOpc = SelectImm8Opcode(Opc);
- } else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() &&
- (MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) {
+ } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
NewOpc = SelectImmOpcode(Opc);
}
@@ -2821,8 +3142,6 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
if (NVT != MVT::i32 && NVT != MVT::i64)
return false;
- unsigned Size = NVT.getSizeInBits();
-
SDValue NBits;
// If we have BMI2's BZHI, we are ok with multi-use patterns.
@@ -2835,16 +3154,27 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
+ auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
+ if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
+ assert(V.getSimpleValueType() == MVT::i32 &&
+ V.getOperand(0).getSimpleValueType() == MVT::i64 &&
+ "Expected i64 -> i32 truncation");
+ V = V.getOperand(0);
+ }
+ return V;
+ };
+
// a) x & ((1 << nbits) + (-1))
- auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool {
+ auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
+ &NBits](SDValue Mask) -> bool {
// Match `add`. Must only have one use!
if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
return false;
// We should be adding all-ones constant (i.e. subtracting one.)
if (!isAllOnesConstant(Mask->getOperand(1)))
return false;
- // Match `1 << nbits`. Must only have one use!
- SDValue M0 = Mask->getOperand(0);
+ // Match `1 << nbits`. Might be truncated. Must only have one use!
+ SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
return false;
if (!isOneConstant(M0->getOperand(0)))
@@ -2853,23 +3183,36 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
return true;
};
+ auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
+ V = peekThroughOneUseTruncation(V);
+ return CurDAG->MaskedValueIsAllOnes(
+ V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
+ NVT.getSizeInBits()));
+ };
+
// b) x & ~(-1 << nbits)
- auto matchPatternB = [&checkOneUse, &NBits](SDValue Mask) -> bool {
+ auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
+ &NBits](SDValue Mask) -> bool {
// Match `~()`. Must only have one use!
- if (!isBitwiseNot(Mask) || !checkOneUse(Mask))
+ if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
return false;
- // Match `-1 << nbits`. Must only have one use!
- SDValue M0 = Mask->getOperand(0);
+ // The -1 only has to be all-ones for the final Node's NVT.
+ if (!isAllOnes(Mask->getOperand(1)))
+ return false;
+ // Match `-1 << nbits`. Might be truncated. Must only have one use!
+ SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
return false;
- if (!isAllOnesConstant(M0->getOperand(0)))
+ // The -1 only has to be all-ones for the final Node's NVT.
+ if (!isAllOnes(M0->getOperand(0)))
return false;
NBits = M0->getOperand(1);
return true;
};
// Match potentially-truncated (bitwidth - y)
- auto matchShiftAmt = [checkOneUse, Size, &NBits](SDValue ShiftAmt) {
+ auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
+ unsigned Bitwidth) {
// Skip over a truncate of the shift amount.
if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
ShiftAmt = ShiftAmt.getOperand(0);
@@ -2881,52 +3224,56 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
if (ShiftAmt.getOpcode() != ISD::SUB)
return false;
auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
- if (!V0 || V0->getZExtValue() != Size)
+ if (!V0 || V0->getZExtValue() != Bitwidth)
return false;
NBits = ShiftAmt.getOperand(1);
return true;
};
// c) x & (-1 >> (32 - y))
- auto matchPatternC = [&checkOneUse, matchShiftAmt](SDValue Mask) -> bool {
+ auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
+ matchShiftAmt](SDValue Mask) -> bool {
+ // The mask itself may be truncated.
+ Mask = peekThroughOneUseTruncation(Mask);
+ unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
// Match `l>>`. Must only have one use!
if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
return false;
- // We should be shifting all-ones constant.
+ // We should be shifting a truly all-ones constant.
if (!isAllOnesConstant(Mask.getOperand(0)))
return false;
SDValue M1 = Mask.getOperand(1);
// The shift amount should not be used externally.
if (!checkOneUse(M1))
return false;
- return matchShiftAmt(M1);
+ return matchShiftAmt(M1, Bitwidth);
};
SDValue X;
// d) x << (32 - y) >> (32 - y)
- auto matchPatternD = [&checkOneUse, &checkTwoUse, matchShiftAmt,
+ auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
&X](SDNode *Node) -> bool {
if (Node->getOpcode() != ISD::SRL)
return false;
SDValue N0 = Node->getOperand(0);
if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
return false;
+ unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
SDValue N1 = Node->getOperand(1);
SDValue N01 = N0->getOperand(1);
// Both of the shifts must be by the exact same value.
// There should not be any uses of the shift amount outside of the pattern.
if (N1 != N01 || !checkTwoUse(N1))
return false;
- if (!matchShiftAmt(N1))
+ if (!matchShiftAmt(N1, Bitwidth))
return false;
X = N0->getOperand(0);
return true;
};
- auto matchLowBitMask = [&matchPatternA, &matchPatternB,
- &matchPatternC](SDValue Mask) -> bool {
- // FIXME: pattern c.
+ auto matchLowBitMask = [matchPatternA, matchPatternB,
+ matchPatternC](SDValue Mask) -> bool {
return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
};
@@ -2946,42 +3293,46 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
SDLoc DL(Node);
- // If we do *NOT* have BMI2, let's find out if the if the 'X' is *logically*
- // shifted (potentially with one-use trunc inbetween),
- // and if so look past one-use truncation.
- MVT XVT = NVT;
- if (!Subtarget->hasBMI2() && X.getOpcode() == ISD::TRUNCATE &&
- X.hasOneUse() && X.getOperand(0).getOpcode() == ISD::SRL) {
- assert(NVT == MVT::i32 && "Expected target valuetype to be i32");
- X = X.getOperand(0);
- XVT = X.getSimpleValueType();
- assert(XVT == MVT::i64 && "Expected truncation from i64");
- }
+ // Truncate the shift amount.
+ NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
- SDValue OrigNBits = NBits;
- if (NBits.getValueType() != XVT) {
- // Truncate the shift amount.
- NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
- insertDAGNode(*CurDAG, OrigNBits, NBits);
-
- // Insert 8-bit NBits into lowest 8 bits of XVT-sized (32 or 64-bit)
- // register. All the other bits are undefined, we do not care about them.
- SDValue ImplDef =
- SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, XVT), 0);
- insertDAGNode(*CurDAG, OrigNBits, ImplDef);
- NBits =
- CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, XVT, ImplDef, NBits);
- insertDAGNode(*CurDAG, OrigNBits, NBits);
- }
+ // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
+ // All the other bits are undefined, we do not care about them.
+ SDValue ImplDef = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
+ NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
+ NBits);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
if (Subtarget->hasBMI2()) {
// Great, just emit the BZHI.
- SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, XVT, X, NBits);
+ if (NVT != MVT::i32) {
+ // But have to place the bit count into the wide-enough register first.
+ NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
+ }
+
+ SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
ReplaceNode(Node, Extract.getNode());
SelectCode(Extract.getNode());
return true;
}
+ // Else, if we do *NOT* have BMI2, let's find out if 'X' is *logically*
+ // shifted (potentially with a one-use trunc in between),
+ // and the truncation was the only use of the shift,
+ // and if so look past one-use truncation.
+ {
+ SDValue RealX = peekThroughOneUseTruncation(X);
+ // FIXME: only if the shift is one-use?
+ if (RealX != X && RealX.getOpcode() == ISD::SRL)
+ X = RealX;
+ }
+
+ MVT XVT = X.getSimpleValueType();
+
// Else, emitting BEXTR requires one more step.
// The 'control' of BEXTR has the pattern of:
// [15...8 bit][ 7...0 bit] location
@@ -2991,10 +3342,11 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
// Shift NBits left by 8 bits, thus producing 'control'.
// This makes the low 8 bits to be zero.
SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
- SDValue Control = CurDAG->getNode(ISD::SHL, DL, XVT, NBits, C8);
- insertDAGNode(*CurDAG, OrigNBits, Control);
+ SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
// If the 'X' is *logically* shifted, we can fold that shift into 'control'.
+ // FIXME: only if the shift is one-use?
if (X.getOpcode() == ISD::SRL) {
SDValue ShiftAmt = X.getOperand(1);
X = X.getOperand(0);
@@ -3003,13 +3355,20 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
"Expected shift amount to be i8");
// Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
+ // We could zext to i16 in some form, but we intentionally don't do that.
SDValue OrigShiftAmt = ShiftAmt;
- ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, XVT, ShiftAmt);
+ ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
// And now 'or' these low 8 bits of shift amount into the 'control'.
- Control = CurDAG->getNode(ISD::OR, DL, XVT, Control, ShiftAmt);
- insertDAGNode(*CurDAG, OrigNBits, Control);
+ Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
+ }
+
+ // But have to place the 'control' into the wide-enough register first.
+ if (XVT != MVT::i32) {
+ Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
}
// And finally, form the BEXTR itself.
@@ -3017,7 +3376,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
// The 'X' was originally truncated. Do that now.
if (XVT != NVT) {
- insertDAGNode(*CurDAG, OrigNBits, Extract);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
}
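The four mask shapes matched above (a-d) and the BZHI/BEXTR they are lowered to all describe the same low-bit extraction; a small plain-C++ sanity check of that equivalence, with a scalar model of BEXTR (start bit in control[7:0], length in control[15:8]). Illustrative only, assuming 32-bit values and n in [1, 31]:

#include <cassert>
#include <cstdint>

// Software model of BEXTR: extract Len bits starting at bit Start.
static uint32_t bextr(uint32_t X, uint32_t Start, uint32_t Len) {
  if (Len == 0) return 0;
  return (X >> Start) & ((Len >= 32) ? ~0u : ((1u << Len) - 1));
}

int main() {
  uint32_t x = 0xDEADBEEF;
  unsigned n = 5;                           // number of low bits to keep
  uint32_t a = x & ((1u << n) - 1);         // pattern a) x & ((1 << n) + (-1))
  uint32_t b = x & ~(~0u << n);             // pattern b) x & ~(-1 << n)
  uint32_t c = x & (~0u >> (32 - n));       // pattern c) x & (-1 >> (32 - n))
  uint32_t d = (x << (32 - n)) >> (32 - n); // pattern d) shl then lshr
  assert(a == b && b == c && c == d);
  assert(a == bextr(x, /*Start=*/0, /*Len=*/n)); // same as BZHI/BEXTR with nbits
  return 0;
}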
@@ -3098,14 +3457,14 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
- SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
- ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+ ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
// Record the mem-refs
CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
} else {
- NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+ NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
}
return NewNode;
@@ -3263,6 +3622,119 @@ bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
return true;
}
+bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
+ MVT NVT = N->getSimpleValueType(0);
+ unsigned Opcode = N->getOpcode();
+ SDLoc dl(N);
+
+ // For operations of the form (x << C1) op C2, check if we can use a smaller
+ // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+ SDValue Shift = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+ if (!Cst)
+ return false;
+
+ int64_t Val = Cst->getSExtValue();
+
+ // If we have an any_extend feeding the AND, look through it to see if there
+ // is a shift behind it. But only if the AND doesn't use the extended bits.
+ // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
+ bool FoundAnyExtend = false;
+ if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
+ Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
+ isUInt<32>(Val)) {
+ FoundAnyExtend = true;
+ Shift = Shift.getOperand(0);
+ }
+
+ if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
+ return false;
+
+ // i8 is unshrinkable, i16 should be promoted to i32.
+ if (NVT != MVT::i32 && NVT != MVT::i64)
+ return false;
+
+ ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ if (!ShlCst)
+ return false;
+
+ uint64_t ShAmt = ShlCst->getZExtValue();
+
+ // Make sure that we don't change the operation by removing bits.
+ // This only matters for OR and XOR, AND is unaffected.
+ uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
+ if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+ return false;
+
+ // Check the minimum bitwidth for the new constant.
+ // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+ auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
+ if (Opcode == ISD::AND) {
+ // AND32ri is the same as AND64ri32 with zext imm.
+ // Try this before sign extended immediates below.
+ ShiftedVal = (uint64_t)Val >> ShAmt;
+ if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
+ return true;
+ // Also swap order when the AND can become MOVZX.
+ if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
+ return true;
+ }
+ ShiftedVal = Val >> ShAmt;
+ if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
+ (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
+ return true;
+ if (Opcode != ISD::AND) {
+ // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
+ ShiftedVal = (uint64_t)Val >> ShAmt;
+ if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
+ return true;
+ }
+ return false;
+ };
+
+ int64_t ShiftedVal;
+ if (!CanShrinkImmediate(ShiftedVal))
+ return false;
+
+ // Ok, we can reorder to get a smaller immediate.
+
+ // But, it's possible the original immediate allowed an AND to become MOVZX.
+ // Doing this check here keeps the MaskedValueIsZero call as late as
+ // possible.
+ if (Opcode == ISD::AND) {
+ // Find the smallest zext this could possibly be.
+ unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
+ ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
+
+ // Figure out which bits need to be zero to achieve that mask.
+ APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
+ ZExtWidth);
+ NeededMask &= ~Cst->getAPIntValue();
+
+ if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
+ return false;
+ }
+
+ SDValue X = Shift.getOperand(0);
+ if (FoundAnyExtend) {
+ SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
+ insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
+ X = NewX;
+ }
+
+ SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
+ insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
+ SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
+ insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
+ SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
+ Shift.getOperand(1));
+ ReplaceNode(N, NewSHL.getNode());
+ SelectCode(NewSHL.getNode());
+ return true;
+}
+
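The reassociation tryShrinkShlLogicImm relies on, (x << C1) op C2 == (x op (C2 >> C1)) << C1 whenever the low C1 bits of C2 are zero (always fine for AND), checks out on concrete values; a plain-C++ sanity sketch, illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 0x12345;
  unsigned C1 = 8;
  uint64_t C2 = 0xFF00; // low 8 bits are zero, so OR/XOR are safe too
  // The shift "removes" the low C1 bits on the right-hand side, so equality
  // needs those bits of C2 to already be zero (for OR/XOR).
  assert(((x << C1) | C2) == ((x | (C2 >> C1)) << C1));

  // The payoff: 0xFF00000000 needs a 64-bit immediate materialization, while
  // 0xFF00000000 >> 32 = 0xFF fits an imm8, giving a much smaller encoding.
  uint64_t Wide = 0xFF00000000ULL;
  assert(((x << 32) | Wide) == ((x | (Wide >> 32)) << 32));
  return 0;
}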
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
@@ -3333,6 +3805,347 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
return true;
}
+static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
+ bool FoldedBCast, bool Masked) {
+ if (Masked) {
+ if (FoldedLoad) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
+ }
+ }
+
+ if (FoldedBCast) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
+ }
+ }
+
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
+ }
+ }
+
+ if (FoldedLoad) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
+ }
+ }
+
+ if (FoldedBCast) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
+ }
+ }
+
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
+ }
+}
+
+// Try to create a VPTESTM instruction. If InMask is not null, it will be used
+// to form a masked operation.
+bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
+ SDValue InMask) {
+ assert(Subtarget->hasAVX512() && "Expected AVX512!");
+ assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Unexpected VT!");
+
+ // Look for equal and not equal compares.
+ ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
+ if (CC != ISD::SETEQ && CC != ISD::SETNE)
+ return false;
+
+ // See if we're comparing against zero. This should have been canonicalized
+ // to RHS during lowering.
+ if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
+ return false;
+
+ SDValue N0 = Setcc.getOperand(0);
+
+ MVT CmpVT = N0.getSimpleValueType();
+ MVT CmpSVT = CmpVT.getVectorElementType();
+
+ // Start with both operands the same. We'll try to refine this.
+ SDValue Src0 = N0;
+ SDValue Src1 = N0;
+
+ {
+ // Look through single use bitcasts.
+ SDValue N0Temp = N0;
+ if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
+ N0Temp = N0.getOperand(0);
+
+ // Look for single use AND.
+ if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
+ Src0 = N0Temp.getOperand(0);
+ Src1 = N0Temp.getOperand(1);
+ }
+ }
+
+ // Without VLX we need to widen the load.
+ bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
+
+ // We can only fold loads if the sources are unique.
+ bool CanFoldLoads = Src0 != Src1;
+
+ // Try to fold loads unless we need to widen.
+ bool FoldedLoad = false;
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
+ if (!Widen && CanFoldLoads) {
+ Load = Src1;
+ FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
+ Tmp4);
+ if (!FoldedLoad) {
+ // AND is commutative.
+ Load = Src0;
+ FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4);
+ if (FoldedLoad)
+ std::swap(Src0, Src1);
+ }
+ }
+
+ auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
+ // Look through single use bitcasts.
+ if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
+ Src = Src.getOperand(0);
+
+ if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
+ Parent = Src.getNode();
+ Src = Src.getOperand(0);
+ if (Src.getSimpleValueType() == CmpSVT)
+ return Src;
+ }
+
+ return SDValue();
+ };
+
+ // If we didn't fold a load, try to match broadcast. No widening limitation
+ // for this. But only 32 and 64 bit types are supported.
+ bool FoldedBCast = false;
+ if (!FoldedLoad && CanFoldLoads &&
+ (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
+ SDNode *ParentNode = nullptr;
+ if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
+ FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
+ Tmp1, Tmp2, Tmp3, Tmp4);
+ }
+
+ // Try the other operand.
+ if (!FoldedBCast) {
+ if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
+ FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
+ Tmp1, Tmp2, Tmp3, Tmp4);
+ if (FoldedBCast)
+ std::swap(Src0, Src1);
+ }
+ }
+ }
+
+ auto getMaskRC = [](MVT MaskVT) {
+ switch (MaskVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v2i1: return X86::VK2RegClassID;
+ case MVT::v4i1: return X86::VK4RegClassID;
+ case MVT::v8i1: return X86::VK8RegClassID;
+ case MVT::v16i1: return X86::VK16RegClassID;
+ case MVT::v32i1: return X86::VK32RegClassID;
+ case MVT::v64i1: return X86::VK64RegClassID;
+ }
+ };
+
+ bool IsMasked = InMask.getNode() != nullptr;
+
+ SDLoc dl(Root);
+
+ MVT ResVT = Setcc.getSimpleValueType();
+ MVT MaskVT = ResVT;
+ if (Widen) {
+ // Widen the inputs using insert_subreg or copy_to_regclass.
+ unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
+ unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
+ unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
+ CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
+ MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
+ CmpVT), 0);
+ Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
+
+ assert(!FoldedLoad && "Shouldn't have folded the load");
+ if (!FoldedBCast)
+ Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
+
+ if (IsMasked) {
+ // Widen the mask.
+ unsigned RegClass = getMaskRC(MaskVT);
+ SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
+ InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, MaskVT, InMask, RC), 0);
+ }
+ }
+
+ bool IsTestN = CC == ISD::SETEQ;
+ unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
+ IsMasked);
+
+ MachineSDNode *CNode;
+ if (FoldedLoad || FoldedBCast) {
+ SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
+
+ if (IsMasked) {
+ SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
+ Load.getOperand(0) };
+ CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ } else {
+ SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
+ Load.getOperand(0) };
+ CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ }
+
+ // Update the chain.
+ ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+ } else {
+ if (IsMasked)
+ CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
+ else
+ CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
+ }
+
+ // If we widened, we need to shrink the mask VT.
+ if (Widen) {
+ unsigned RegClass = getMaskRC(ResVT);
+ SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
+ CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, ResVT, SDValue(CNode, 0), RC);
+ }
+
+ ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
+ CurDAG->RemoveDeadNode(Root);
+ return true;
+}
+
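What tryVPTESTM ultimately selects is a per-element test of X[i] & Y[i] against zero; a scalar model of the k-mask the instruction produces (plain C++, just the identity the pattern match relies on, not the actual instruction semantics):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  // Four 32-bit elements, standing in for one v4i32 operand pair.
  std::array<uint32_t, 4> X = {0u, 1u, 0xF0u, 0x0Fu};
  std::array<uint32_t, 4> Y = {1u, 1u, 0x0Fu, 0x0Fu};

  // setcc (and X, Y), 0, setne  ->  VPTESTM:  mask[i] = ((X[i] & Y[i]) != 0)
  // setcc (and X, Y), 0, seteq  ->  VPTESTNM: mask[i] = ((X[i] & Y[i]) == 0)
  unsigned MaskM = 0, MaskNM = 0;
  for (unsigned i = 0; i < 4; ++i) {
    MaskM  |= ((X[i] & Y[i]) != 0) << i;
    MaskNM |= ((X[i] & Y[i]) == 0) << i;
  }
  assert(MaskM == 0b1010u && MaskNM == 0b0101u);
  return 0;
}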
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opcode = Node->getOpcode();
@@ -3346,6 +4159,61 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
switch (Opcode) {
default: break;
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = Node->getConstantOperandVal(1);
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_sse3_monitor:
+ case Intrinsic::x86_monitorx:
+ case Intrinsic::x86_clzero: {
+ bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
+
+ unsigned Opc = 0;
+ switch (IntNo) {
+ case Intrinsic::x86_sse3_monitor:
+ if (!Subtarget->hasSSE3())
+ break;
+ Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
+ break;
+ case Intrinsic::x86_monitorx:
+ if (!Subtarget->hasMWAITX())
+ break;
+ Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
+ break;
+ case Intrinsic::x86_clzero:
+ if (!Subtarget->hasCLZERO())
+ break;
+ Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
+ break;
+ }
+
+ if (Opc) {
+ unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
+ SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
+ Node->getOperand(2), SDValue());
+ SDValue InFlag = Chain.getValue(1);
+
+ if (IntNo == Intrinsic::x86_sse3_monitor ||
+ IntNo == Intrinsic::x86_monitorx) {
+ // Copy the other two operands to ECX and EDX.
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+ { Chain, InFlag});
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ }
+ }
+
+ break;
+ }
case ISD::BRIND: {
if (Subtarget->isTargetNaCl())
// NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
@@ -3381,13 +4249,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
break;
- case X86ISD::BLENDV: {
- // BLENDV selects like a regular VSELECT.
- SDValue VSelect = CurDAG->getNode(
- ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+ case ISD::VSELECT: {
+ // Replace VSELECT with non-mask conditions with BLENDV.
+ if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+ break;
+
+ assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+ SDValue Blendv = CurDAG->getNode(
+ X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2));
- ReplaceNode(Node, VSelect.getNode());
- SelectCode(VSelect.getNode());
+ ReplaceNode(Node, Blendv.getNode());
+ SelectCode(Blendv.getNode());
// We already called ReplaceUses.
return;
}
@@ -3403,6 +4275,18 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
case ISD::AND:
+ if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
+ // Try to form a masked VPTESTM. Operands can be in either order.
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
+ tryVPTESTM(Node, N0, N1))
+ return;
+ if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
+ tryVPTESTM(Node, N1, N0))
+ return;
+ }
+
if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
CurDAG->RemoveDeadNode(Node);
@@ -3415,89 +4299,113 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
LLVM_FALLTHROUGH;
case ISD::OR:
- case ISD::XOR: {
-
- // For operations of the form (x << C1) op C2, check if we can use a smaller
- // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
- SDValue N0 = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
+ case ISD::XOR:
+ if (tryShrinkShlLogicImm(Node))
+ return;
- if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+ LLVM_FALLTHROUGH;
+ case ISD::ADD:
+ case ISD::SUB: {
+ // Try to avoid folding immediates with multiple uses for optsize.
+ // This code tries to select to register form directly to avoid going
+ // through the isel table which might fold the immediate. We can't change
+ // the add/sub/and/or/xor with immediate patterns in the
+ // tablegen files to check immediate use count without making the patterns
+ // unavailable to the fast-isel table.
+ if (!OptForSize)
break;
- // i8 is unshrinkable, i16 should be promoted to i32.
- if (NVT != MVT::i32 && NVT != MVT::i64)
+ // Only handle i8/i16/i32/i64.
+ if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
break;
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
- ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
- if (!Cst || !ShlCst)
+ if (!Cst)
break;
int64_t Val = Cst->getSExtValue();
- uint64_t ShlVal = ShlCst->getZExtValue();
- // Make sure that we don't change the operation by removing bits.
- // This only matters for OR and XOR, AND is unaffected.
- uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
- if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+ // Make sure it's an immediate that is considered foldable.
+ // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
+ if (!isInt<8>(Val) && !isInt<32>(Val))
break;
- unsigned ShlOp, AddOp, Op;
- MVT CstVT = NVT;
-
- // Check the minimum bitwidth for the new constant.
- // TODO: AND32ri is the same as AND64ri32 with zext imm.
- // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
- // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
- if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
- CstVT = MVT::i8;
- else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
- CstVT = MVT::i32;
-
- // Bail if there is no smaller encoding.
- if (NVT == CstVT)
+ // Check if we should avoid folding this immediate.
+ if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
break;
+ // We should not fold the immediate. So we need a register form instead.
+ unsigned ROpc, MOpc;
switch (NVT.SimpleTy) {
- default: llvm_unreachable("Unsupported VT!");
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::i8:
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
+ case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
+ case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
+ case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
+ case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
+ }
+ break;
+ case MVT::i16:
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
+ case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
+ case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
+ case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
+ case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
+ }
+ break;
case MVT::i32:
- assert(CstVT == MVT::i8);
- ShlOp = X86::SHL32ri;
- AddOp = X86::ADD32rr;
-
switch (Opcode) {
- default: llvm_unreachable("Impossible opcode");
- case ISD::AND: Op = X86::AND32ri8; break;
- case ISD::OR: Op = X86::OR32ri8; break;
- case ISD::XOR: Op = X86::XOR32ri8; break;
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
+ case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
+ case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
+ case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
+ case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
}
break;
case MVT::i64:
- assert(CstVT == MVT::i8 || CstVT == MVT::i32);
- ShlOp = X86::SHL64ri;
- AddOp = X86::ADD64rr;
-
switch (Opcode) {
- default: llvm_unreachable("Impossible opcode");
- case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
- case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
- case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
+ case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
+ case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
+ case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
+ case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
}
break;
}
- // Emit the smaller op and the shift.
- SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
- SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
- if (ShlVal == 1)
- CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
- SDValue(New, 0));
- else
- CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
- getI8Imm(ShlVal, dl));
+ // Ok this is a AND/OR/XOR/ADD/SUB with constant.
+
+ // If this is not a subtract, we can still try to fold a load.
+ if (Opcode != ISD::SUB) {
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ // Update the chain.
+ ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ }
+
+ CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
return;
}
+
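The optsize heuristic in the ADD/SUB/AND/OR/XOR case above pays off because a wide immediate folded into several instructions is re-encoded at every use, while the register form materializes it once; a back-of-the-envelope sketch using assumed (approximate, REX-free) x86 encoding sizes:

#include <cassert>

int main() {
  // Rough x86 encoding sizes, illustrative only:
  // and r32, imm32 = 6 bytes, and r32, r32 = 2 bytes, mov r32, imm32 = 5 bytes.
  unsigned Uses = 3;
  unsigned FoldedImmBytes = Uses * 6;      // the imm32 is repeated per use
  unsigned RegFormBytes   = 5 + Uses * 2;  // materialize once, reuse the reg
  assert(RegFormBytes < FoldedImmBytes);   // 11 < 18: smaller under optsize
  return 0;
}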
case X86ISD::SMUL:
// i16/i32/i64 are handled with isel patterns.
if (NVT != MVT::i8)
@@ -3895,7 +4803,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned TrailingZeros = countTrailingZeros(Mask);
SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
SDValue Shift =
- SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64,
+ SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
N0.getOperand(0), Imm), 0);
MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
MVT::i32, Shift, Shift);
@@ -3906,7 +4814,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned LeadingZeros = countLeadingZeros(Mask);
SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
SDValue Shift =
- SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64,
+ SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
N0.getOperand(0), Imm), 0);
MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
MVT::i32, Shift, Shift);
@@ -3964,8 +4872,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
- // FIXME: We should be able to fold loads here.
-
SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
SDValue Reg = N0.getOperand(0);
@@ -4058,10 +4964,46 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
return;
}
+ case ISD::SETCC: {
+ if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
+ return;
+
+ break;
+ }
+
case ISD::STORE:
if (foldLoadStoreIntoMemOperand(Node))
return;
break;
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FTRUNC:
+ case ISD::FNEARBYINT:
+ case ISD::FRINT: {
+ // Replace fp rounding ops with their X86-specific equivalents so we don't
+ // need 2 sets of patterns.
+ // FIXME: This can only happen when the nodes started as STRICT_* and have
+ // been mutated into their non-STRICT equivalents. Eventually this
+ // mutation will be removed and we should switch the STRICT_ nodes to a
+ // strict version of RNDSCALE in PreProcessISelDAG.
+ unsigned Imm;
+ switch (Node->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::FCEIL: Imm = 0xA; break;
+ case ISD::FFLOOR: Imm = 0x9; break;
+ case ISD::FTRUNC: Imm = 0xB; break;
+ case ISD::FNEARBYINT: Imm = 0xC; break;
+ case ISD::FRINT: Imm = 0x4; break;
+ }
+ SDLoc dl(Node);
+ SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
+ Node->getValueType(0),
+ Node->getOperand(0),
+ CurDAG->getConstant(Imm, dl, MVT::i8));
+ ReplaceNode(Node, Res.getNode());
+ SelectCode(Res.getNode());
+ return;
+ }
}
SelectCode(Node);
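The VRNDSCALE immediates chosen in the FCEIL/FFLOOR/FTRUNC/FNEARBYINT/FRINT case above follow the imm8 layout assumed here to be shared with the SSE4.1 ROUND* instructions (rounding mode in bits 1:0, "use MXCSR mode" in bit 2, "suppress inexact" in bit 3); a tiny decoder as a sketch of that assumption, not authoritative documentation:

#include <cassert>
#include <cstdint>

// Assumed imm8 layout (low nibble):
//   bits[1:0] = rounding mode (00 nearest, 01 down, 10 up, 11 truncate)
//   bit 2     = if set, use the current MXCSR rounding mode instead
//   bit 3     = if set, suppress the precision (inexact) exception
struct RoundImm { unsigned Mode, UseMXCSR, SuppressInexact; };

static RoundImm decode(uint8_t Imm) {
  return {Imm & 0x3u, (Imm >> 2) & 1u, (Imm >> 3) & 1u};
}

int main() {
  assert(decode(0xA).Mode == 2 && decode(0xA).SuppressInexact); // ceil
  assert(decode(0x9).Mode == 1 && decode(0x9).SuppressInexact); // floor
  assert(decode(0xB).Mode == 3 && decode(0xB).SuppressInexact); // trunc
  assert(decode(0xC).UseMXCSR && decode(0xC).SuppressInexact);  // nearbyint
  assert(decode(0x4).UseMXCSR && !decode(0x4).SuppressInexact); // rint
  return 0;
}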
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b6a692ee187d..0b4bf687e6cf 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -131,7 +130,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addBypassSlowDiv(64, 32);
}
- if (Subtarget.isTargetKnownWindowsMSVC() ||
+ if (Subtarget.isTargetWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
@@ -159,6 +158,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setUseUnderscoreLongJmp(true);
}
+ // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
+ // to 32 bits so AtomicExpandPass will expand it and we don't need cmpxchg8b.
+ // FIXME: Should we be limiting the atomic size on other configs? Default is
+ // 1024.
+ if (!Subtarget.hasCmpxchg8b())
+ setMaxAtomicSizeInBitsSupported(32);
+
// Set up the register classes.
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
@@ -190,10 +196,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
- setOperationAction(ISD::ABS , MVT::i32 , Custom);
- if (Subtarget.is64Bit())
- setOperationAction(ISD::ABS , MVT::i64 , Custom);
+ setOperationAction(ISD::ABS , MVT::i32 , Custom);
}
+ setOperationAction(ISD::ABS , MVT::i64 , Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
@@ -258,14 +263,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
- if (X86ScalarSSEf32) {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
- // f32 and f64 cases are Legal, f80 case is not
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
- }
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
} else {
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
@@ -415,6 +414,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ else
+ setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
@@ -486,6 +487,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
+ if (!Subtarget.is64Bit())
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+
if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
@@ -530,6 +534,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
: &X86::FR64RegClass);
+ // Disable f32->f64 extload as we can only generate this in one instruction
+ // under optsize. So it's easier to pattern match (fpext (load)) for that
+ // case instead of needing to emit 2 instructions for extload in the
+ // non-optsize case.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
@@ -668,6 +678,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
+ setOperationAction(ISD::LROUND, MVT::f80, Expand);
+ setOperationAction(ISD::LLROUND, MVT::f80, Expand);
+ setOperationAction(ISD::LRINT, MVT::f80, Expand);
+ setOperationAction(ISD::LLRINT, MVT::f80, Expand);
}
// Always use a library call for pow.
@@ -780,6 +794,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v2f32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -841,6 +858,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
if (!ExperimentalVectorWideningLegalization) {
// Use widening instead of promotion.
@@ -950,17 +971,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
- for (MVT VT : MVT::fp_vector_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
-
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
- setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
- setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
@@ -1128,14 +1144,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
- for (MVT VT : MVT::fp_vector_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
-
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
@@ -1144,13 +1156,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRA, VT, Custom);
}
- if (ExperimentalVectorWideningLegalization) {
- // These types need custom splitting if their input is a 128-bit vector.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
- }
+ // These types need custom splitting if their input is a 128-bit vector.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
@@ -1182,9 +1192,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
- // TODO - remove this once 256-bit X86ISD::ANDNP correctly split.
- setOperationAction(ISD::CTTZ, VT, HasInt256 ? Expand : Custom);
-
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
@@ -1260,7 +1267,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
- setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
}
@@ -1282,6 +1289,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
}
if (HasInt256)
@@ -1352,19 +1360,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
}
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
@@ -1378,9 +1381,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
- for (MVT VT : MVT::fp_vector_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
-
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
@@ -1413,10 +1413,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+ // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
+ // to 512-bit rather than use the AVX2 instructions so that we can use
+ // k-masks.
if (!Subtarget.hasVLX()) {
- // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
- // to 512-bit rather than use the AVX2 instructions so that we can use
- // k-masks.
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
@@ -1446,6 +1446,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
+
+ setOperationAction(ISD::SELECT, VT, Custom);
}
// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
@@ -1465,13 +1467,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
- setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
- setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
- setOperationAction(ISD::SELECT, MVT::v16i32, Custom);
- setOperationAction(ISD::SELECT, MVT::v32i16, Custom);
- setOperationAction(ISD::SELECT, MVT::v64i8, Custom);
- setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
-
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
@@ -1485,6 +1480,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1705,6 +1701,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1788,7 +1785,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
}
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
@@ -1842,8 +1838,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
- if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
- Subtarget.isTargetWindowsItanium()))
+ if (Subtarget.is32Bit() &&
+ (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
ISD::FLOG10, ISD::FPOW, ISD::FSIN})
@@ -1854,6 +1850,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
@@ -1881,6 +1878,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
@@ -2050,20 +2050,19 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
-EVT
-X86TargetLowering::getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
- const Function &F = MF.getFunction();
- if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
- if (Size >= 16 &&
- (!Subtarget.isUnalignedMem16Slow() ||
- ((DstAlign == 0 || DstAlign >= 16) &&
- (SrcAlign == 0 || SrcAlign >= 16)))) {
+/// For vector ops we check that the overall size isn't larger than our
+/// preferred vector width.
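+/// e.g. if the preferred vector width is 128, a 32-byte memcpy is expanded
+/// with two v16i8 operations rather than a single v32i8 operation.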
+EVT X86TargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
+ if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
+ ((DstAlign == 0 || DstAlign >= 16) &&
+ (SrcAlign == 0 || SrcAlign >= 16)))) {
// FIXME: Check if unaligned 32-byte accesses are slow.
- if (Size >= 32 && Subtarget.hasAVX()) {
+ if (Size >= 32 && Subtarget.hasAVX() &&
+ (Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
// choose an optimal type with a vector element larger than a byte,
@@ -2071,11 +2070,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
// multiply) before we splat as a vector.
return MVT::v32i8;
}
- if (Subtarget.hasSSE2())
+ if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v16i8;
// TODO: Can SSE1 handle a byte vector?
// If we have SSE1 registers we should be able to use them.
- if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
+ if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
+ (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
} else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
!Subtarget.is64Bit() && Subtarget.hasSSE2()) {
@@ -2104,11 +2104,9 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
return true;
}
-bool
-X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned,
- unsigned,
- bool *Fast) const {
+bool X86TargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
if (Fast) {
switch (VT.getSizeInBits()) {
default:
@@ -2124,6 +2122,16 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// TODO: What about AVX-512 (512-bit) accesses?
}
}
+ // NonTemporal vector memory ops must be aligned.
+ if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
+ // NT loads can only be vector aligned, so if it's less aligned than the
+ // minimum vector size (which we can split the vector down to), we might as
+ // well use a regular unaligned vector load.
+ // We don't have any NT loads pre-SSE41.
+ if (!!(Flags & MachineMemOperand::MOLoad))
+ return (Align < 16 || !Subtarget.hasSSE41());
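+ // NT vector stores, on the other hand, always require alignment, so a
+ // misaligned NT vector store is never allowed.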
+ return false;
+ }
// Misaligned accesses of any size are always allowed.
return true;
}
@@ -2281,12 +2289,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
Type::getInt8PtrTy(M.getContext()));
// MSVC CRT has a function to validate security cookie.
- auto *SecurityCheckCookie = cast<Function>(
- M.getOrInsertFunction("__security_check_cookie",
- Type::getVoidTy(M.getContext()),
- Type::getInt8PtrTy(M.getContext())));
- SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
- SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
+ FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
+ "__security_check_cookie", Type::getVoidTy(M.getContext()),
+ Type::getInt8PtrTy(M.getContext()));
+ if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
+ F->setCallingConv(CallingConv::X86_FastCall);
+ F->addAttribute(1, Attribute::AttrKind::InReg);
+ }
return;
}
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
@@ -2304,7 +2313,7 @@ Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
return TargetLowering::getSDagStackGuard(M);
}
-Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
@@ -2347,8 +2356,6 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
-#include "X86GenCallingConv.inc"
-
bool X86TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
@@ -2703,7 +2710,6 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
"The values should reside in two registers");
SDValue Lo, Hi;
- unsigned Reg;
SDValue ArgValueLo, ArgValueHi;
MachineFunction &MF = DAG.getMachineFunction();
@@ -2713,7 +2719,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register.
- Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
@@ -2934,6 +2940,8 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
case CallingConv::X86_StdCall:
case CallingConv::X86_VectorCall:
case CallingConv::X86_FastCall:
+ // Swift:
+ case CallingConv::Swift:
return true;
default:
return canGuaranteeTCO(CC);
@@ -2986,22 +2994,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
else
ValVT = VA.getValVT();
- // Calculate SP offset of interrupt parameter, re-arrange the slot normally
- // taken by a return address.
- int Offset = 0;
- if (CallConv == CallingConv::X86_INTR) {
- // X86 interrupts may take one or two arguments.
- // On the stack there will be no return address as in regular call.
- // Offset of last argument need to be set to -4/-8 bytes.
- // Where offset of the first argument out of two, should be set to 0 bytes.
- Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
- if (Subtarget.is64Bit() && Ins.size() == 2) {
- // The stack pointer needs to be realigned for 64 bit handlers with error
- // code, so the argument offset changes by 8 bytes.
- Offset += 8;
- }
- }
-
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization mark all arguments mutable. Since they
@@ -3014,15 +3006,15 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
// can be improved with deeper analysis.
int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
/*isAliased=*/true);
- // Adjust SP offset of interrupt parameter.
- if (CallConv == CallingConv::X86_INTR) {
- MFI.setObjectOffset(FI, Offset);
- }
return DAG.getFrameIndex(FI, PtrVT);
}
// This is an argument in memory. We might be able to perform copy elision.
- if (Flags.isCopyElisionCandidate()) {
+ // If the argument is passed directly in memory without any extension, then we
+ // can perform copy elision. Large vector types, for example, may be passed
+ // indirectly by pointer.
+ if (Flags.isCopyElisionCandidate() &&
+ VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
EVT ArgVT = Ins[i].ArgVT;
SDValue PartAddr;
if (Ins[i].PartOffset == 0) {
@@ -3031,7 +3023,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
// load from our portion of it. This assumes that if the first part of an
// argument is in memory, the rest will also be in memory.
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
- /*Immutable=*/false);
+ /*IsImmutable=*/false);
PartAddr = DAG.getFrameIndex(FI, PtrVT);
return DAG.getLoad(
ValVT, dl, Chain, PartAddr,
@@ -3072,11 +3064,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
MFI.setObjectSExt(FI, true);
}
- // Adjust SP offset of interrupt parameter.
- if (CallConv == CallingConv::X86_INTR) {
- MFI.setObjectOffset(FI, Offset);
- }
-
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
@@ -3166,14 +3153,6 @@ SDValue X86TargetLowering::LowerFormalArguments(
!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
- if (CallConv == CallingConv::X86_INTR) {
- bool isLegal = Ins.size() == 1 ||
- (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
- (!Is64Bit && Ins[1].VT == MVT::i32)));
- if (!isLegal)
- report_fatal_error("X86 interrupts may take one or two arguments");
- }
-
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
@@ -3454,11 +3433,11 @@ SDValue X86TargetLowering::LowerFormalArguments(
}
// Copy all forwards from physical to virtual registers.
- for (ForwardedRegister &F : Forwards) {
+ for (ForwardedRegister &FR : Forwards) {
// FIXME: Can we use a less constrained schedule?
- SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
- F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
- Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
+ SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
+ FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
+ Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
}
}
@@ -3610,6 +3589,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
+ MachineFunction::CallSiteInfo CSInfo;
+
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
@@ -3805,6 +3786,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.EnableDebugEntryValues)
+ CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
// shadow reg if callee is a varargs function.
@@ -3975,46 +3959,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
- } else if (Callee->getOpcode() == ISD::GlobalAddress) {
- // If the callee is a GlobalAddress node (quite common, every direct call
- // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
- // it.
- GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
-
- // We should use extra load for direct calls to dllimported functions in
- // non-JIT mode.
- const GlobalValue *GV = G->getGlobal();
- if (!GV->hasDLLImportStorageClass()) {
- unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
-
- Callee = DAG.getTargetGlobalAddress(
- GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
-
- if (OpFlags == X86II::MO_GOTPCREL) {
- // Add a wrapper.
- Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
- getPointerTy(DAG.getDataLayout()), Callee);
- // Add extra indirection
- Callee = DAG.getLoad(
- getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()));
- }
- }
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
- unsigned char OpFlags =
- Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
-
- Callee = DAG.getTargetExternalSymbol(
- S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
-
- if (OpFlags == X86II::MO_GOTPCREL) {
- Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
- getPointerTy(DAG.getDataLayout()), Callee);
- Callee = DAG.getLoad(
- getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()));
- }
+ } else if (Callee->getOpcode() == ISD::GlobalAddress ||
+ Callee->getOpcode() == ISD::ExternalSymbol) {
+ // Lower direct calls to global addresses and external symbols. Setting
+ // ForCall to true here has the effect of removing WrapperRIP when possible
+ // to allow direct calls to be selected without first materializing the
+ // address into a register.
+ Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -4105,7 +4056,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
- return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+ SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+ DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+ return Ret;
}
if (HasNoCfCheck && IsCFProtectionSupported) {
@@ -4114,6 +4067,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
+ DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
@@ -4787,7 +4741,6 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
if (!IntrData)
return false;
- Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.flags = MachineMemOperand::MONone;
Info.offset = 0;
@@ -4795,6 +4748,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
+ Info.opc = ISD::INTRINSIC_VOID;
Info.ptrVal = I.getArgOperand(0);
MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
@@ -4810,6 +4764,31 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
break;
}
+ case GATHER:
+ case GATHER_AVX2: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = nullptr;
+ MVT DataVT = MVT::getVT(I.getType());
+ MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
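+ // Only min(#data elts, #index elts) elements are actually accessed, e.g. a
+ // v2i64 gather with a v4i32 index only loads two elements.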
+ unsigned NumElts = std::min(DataVT.getVectorNumElements(),
+ IndexVT.getVectorNumElements());
+ Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
+ Info.align = 1;
+ Info.flags |= MachineMemOperand::MOLoad;
+ break;
+ }
+ case SCATTER: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.ptrVal = nullptr;
+ MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
+ MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
+ unsigned NumElts = std::min(DataVT.getVectorNumElements(),
+ IndexVT.getVectorNumElements());
+ Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
+ Info.align = 1;
+ Info.flags |= MachineMemOperand::MOStore;
+ break;
+ }
default:
return false;
}
@@ -4820,7 +4799,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
-bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
return true;
@@ -4837,6 +4817,26 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+
+ // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
+ // those uses are extracted directly into a store, then the extract + store
+ // can be store-folded. Therefore, it's probably not worth splitting the load.
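+ // e.g. a 256-bit load whose 128-bit halves are each extracted straight into
+ // stores can fold those extracts into the stores (vextractf128 to memory).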
+ EVT VT = Load->getValueType(0);
+ if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
+ for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
+ // Skip uses of the chain value. Result 0 of the node is the load value.
+ if (UI.getUse().getResNo() != 0)
+ continue;
+
+ // If this use is not an extract + store, it's probably worth splitting.
+ if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
+ UI->use_begin()->getOpcode() != ISD::STORE)
+ return true;
+ }
+ // All non-chain uses are extract + store.
+ return false;
+ }
+
return true;
}
@@ -4909,15 +4909,29 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
}
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
+ unsigned Opc = VecOp.getOpcode();
+
+ // Assume target opcodes can't be scalarized.
+ // TODO - do we have any exceptions?
+ if (Opc >= ISD::BUILTIN_OP_END)
+ return false;
+
// If the vector op is not supported, try to convert to scalar.
EVT VecVT = VecOp.getValueType();
- if (!isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), VecVT))
+ if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
return true;
// If the vector op is supported, but the scalar op is not, the transform may
// not be worthwhile.
EVT ScalarVT = VecVT.getScalarType();
- return isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), ScalarVT);
+ return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
+}
+
+bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
+ // TODO: Allow vectors?
+ if (VT.isVector())
+ return false;
+ return VT.isSimple() || !isOperationExpand(Opcode, VT);
}
bool X86TargetLowering::isCheapToSpeculateCttz() const {
@@ -4930,8 +4944,9 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget.hasLZCNT();
}
-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
- EVT BitcastVT) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
BitcastVT.getVectorElementType() == MVT::i1)
return false;
@@ -4939,7 +4954,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
- return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
+ // If both types are legal vectors, it's always ok to convert them.
+ if (LoadVT.isVector() && BitcastVT.isVector() &&
+ isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+ return true;
+
+ return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
@@ -4953,6 +4973,10 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
return (MemVT.getSizeInBits() <= MaxIntSize);
}
+ // Make sure we don't merge greater than our preferred vector
+ // width.
+ if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
+ return false;
return true;
}
@@ -4998,7 +5022,25 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const {
return Subtarget.hasSSE2();
}
-bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
+bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
+ const SDNode *N, CombineLevel Level) const {
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+ EVT VT = N->getValueType(0);
+ if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
+ (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
+ // Only fold if the shift values are equal - so it folds to AND.
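+ // e.g. with equal amounts, (X >>u C) << C folds to X & (-1 << C) instead
+ // of two shift instructions.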
+ // TODO - we should fold if either is a non-uniform vector but we don't do
+ // the fold for non-splats yet.
+ return N->getOperand(1) == N->getOperand(0).getOperand(1);
+ }
+ return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
+}
+
+bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
EVT VT = Y.getValueType();
// For vectors, we don't have a preference, but we probably want a mask.
@@ -5048,8 +5090,8 @@ static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
-/// Return true if every element in Mask, beginning
-/// from position Pos and ending in Pos+Size is the undef sentinel value.
+/// Return true if every element in Mask, beginning from position Pos and ending
+/// in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
if (Mask[i] != SM_SentinelUndef)
@@ -5057,6 +5099,18 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
return true;
}
+/// Return true if the mask creates a vector whose lower half is undefined.
+static bool isUndefLowerHalf(ArrayRef<int> Mask) {
+ unsigned NumElts = Mask.size();
+ return isUndefInRange(Mask, 0, NumElts / 2);
+}
+
+/// Return true if the mask creates a vector whose upper half is undefined.
+static bool isUndefUpperHalf(ArrayRef<int> Mask) {
+ unsigned NumElts = Mask.size();
+ return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
+}
+
/// Return true if Val falls within the specified range (L, H].
static bool isInRange(int Val, int Low, int Hi) {
return (Val >= Low && Val < Hi);
@@ -5409,6 +5463,53 @@ static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
DAG.getIntPtrConstant(0, dl));
}
+/// Widen a vector to a larger size with the same scalar type, with the new
+/// elements either zero or undef.
+static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl, unsigned WideSizeInBits) {
+ assert(Vec.getValueSizeInBits() < WideSizeInBits &&
+ (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
+ "Unsupported vector widening type");
+ unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
+ MVT SVT = Vec.getSimpleValueType().getScalarType();
+ MVT VT = MVT::getVectorVT(SVT, WideNumElts);
+ return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
+}
+
+// Helper function to collect subvector ops that are concatenated together,
+// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
+// The subvectors in Ops are guaranteed to be the same type.
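+// e.g. insert_subvector(insert_subvector(Base, X, 0), Y, NumElts/2) collects
+// the pair {X, Y}.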
+static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+ assert(Ops.empty() && "Expected an empty ops vector");
+
+ if (N->getOpcode() == ISD::CONCAT_VECTORS) {
+ Ops.append(N->op_begin(), N->op_end());
+ return true;
+ }
+
+ if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isa<ConstantSDNode>(N->getOperand(2))) {
+ SDValue Src = N->getOperand(0);
+ SDValue Sub = N->getOperand(1);
+ const APInt &Idx = N->getConstantOperandAPInt(2);
+ EVT VT = Src.getValueType();
+ EVT SubVT = Sub.getValueType();
+
+ // TODO - Handle more general insert_subvector chains.
+ if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
+ Idx == (VT.getVectorNumElements() / 2) &&
+ Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isNullConstant(Src.getOperand(2))) {
+ Ops.push_back(Src.getOperand(1));
+ Ops.push_back(Sub);
+ return true;
+ }
+ }
+
+ return false;
+}
+
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
@@ -5457,19 +5558,6 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
-// Return true if the instruction zeroes the unused upper part of the
-// destination and accepts mask.
-static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
- switch (Opcode) {
- default:
- return false;
- case X86ISD::CMPM:
- case X86ISD::CMPM_RND:
- case ISD::SETCC:
- return true;
- }
-}
-
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -5626,10 +5714,29 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
return DAG.getBitcast(VT, Vec);
}
-static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In,
- SelectionDAG &DAG) {
+// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
+static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return ISD::ANY_EXTEND_VECTOR_INREG;
+ case ISD::ZERO_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return ISD::ZERO_EXTEND_VECTOR_INREG;
+ case ISD::SIGN_EXTEND:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return ISD::SIGN_EXTEND_VECTOR_INREG;
+ }
+ llvm_unreachable("Unknown opcode");
+}
+
+static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue In, SelectionDAG &DAG) {
EVT InVT = In.getValueType();
assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
+ assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
+ ISD::ZERO_EXTEND == Opcode) &&
+ "Unknown extension opcode");
// For 256-bit vectors, we only need the lower (128-bit) input half.
// For 512-bit vectors, we only need the lower input half or quarter.
@@ -5642,13 +5749,10 @@ static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In,
InVT = In.getValueType();
}
- if (VT.getVectorNumElements() == InVT.getVectorNumElements())
- return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
- DL, VT, In);
+ if (VT.getVectorNumElements() != InVT.getVectorNumElements())
+ Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
- return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG
- : ISD::ZERO_EXTEND_VECTOR_INREG,
- DL, VT, In);
+ return DAG.getNode(Opcode, DL, VT, In);
}
/// Returns a vector_shuffle node for an unpackl operation.
@@ -5686,18 +5790,8 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
-// Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
-static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
- while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
- V = V.getOperand(0);
- return V;
-}
-
-static const Constant *getTargetConstantFromNode(SDValue Op) {
- Op = peekThroughBitcasts(Op);
-
- auto *Load = dyn_cast<LoadSDNode>(Op);
- if (!Load)
+static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
+ if (!Load || !ISD::isNormalLoad(Load))
return nullptr;
SDValue Ptr = Load->getBasePtr();
@@ -5712,6 +5806,17 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
return CNode->getConstVal();
}
+static const Constant *getTargetConstantFromNode(SDValue Op) {
+ Op = peekThroughBitcasts(Op);
+ return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
+}
+
+const Constant *
+X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
+ assert(LD && "Unexpected null LoadSDNode");
+ return getTargetConstantFromNode(LD);
+}
+
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
@@ -5778,8 +5883,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
return false;
- APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
- EltBits[i] = Bits.getZExtValue();
+ EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
}
return true;
};
@@ -5899,6 +6003,19 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
}
+ // Extract constant bits from a subvector broadcast.
+ if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
+ SmallVector<APInt, 16> SubEltBits;
+ if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
+ UndefElts, SubEltBits, AllowWholeUndefs,
+ AllowPartialUndefs)) {
+ UndefElts = APInt::getSplat(NumElts, UndefElts);
+ while (EltBits.size() < NumElts)
+ EltBits.append(SubEltBits.begin(), SubEltBits.end());
+ return true;
+ }
+ }
+
// Extract a rematerialized scalar constant insertion.
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
@@ -5914,6 +6031,29 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return CastBitData(UndefSrcElts, SrcEltBits);
}
+ // Insert constant bits from a base and sub vector sources.
+ if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isa<ConstantSDNode>(Op.getOperand(2))) {
+ // TODO - support insert_subvector through bitcasts.
+ if (EltSizeInBits != VT.getScalarSizeInBits())
+ return false;
+
+ APInt UndefSubElts;
+ SmallVector<APInt, 32> EltSubBits;
+ if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
+ UndefSubElts, EltSubBits,
+ AllowWholeUndefs, AllowPartialUndefs) &&
+ getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
+ UndefElts, EltBits, AllowWholeUndefs,
+ AllowPartialUndefs)) {
+ unsigned BaseIdx = Op.getConstantOperandVal(2);
+ UndefElts.insertBits(UndefSubElts, BaseIdx);
+ for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
+ EltBits[BaseIdx + i] = EltSubBits[i];
+ return true;
+ }
+ }
+
// Extract constant bits from a subvector's source.
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(1))) {
@@ -6068,6 +6208,34 @@ static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
}
}
+// Split the demanded elts of a HADD/HSUB node between its operands.
+static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
+ APInt &DemandedLHS, APInt &DemandedRHS) {
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumElts = DemandedElts.getBitWidth();
+ int NumEltsPerLane = NumElts / NumLanes;
+ int HalfEltsPerLane = NumEltsPerLane / 2;
+
+ DemandedLHS = APInt::getNullValue(NumElts);
+ DemandedRHS = APInt::getNullValue(NumElts);
+
+ // Map DemandedElts to the horizontal operands.
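+ // e.g. for v8i16 HADD, result element 5 is RHS[2]+RHS[3], so demanding it
+ // sets bits 2 and 3 of DemandedRHS.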
+ for (int Idx = 0; Idx != NumElts; ++Idx) {
+ if (!DemandedElts[Idx])
+ continue;
+ int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
+ int LocalIdx = Idx % NumEltsPerLane;
+ if (LocalIdx < HalfEltsPerLane) {
+ DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
+ DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
+ } else {
+ LocalIdx -= HalfEltsPerLane;
+ DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
+ DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
+ }
+ }
+}
+
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
@@ -6468,14 +6636,15 @@ static bool setTargetShuffleZeroElements(SDValue N,
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- const SelectionDAG &DAG);
+ SelectionDAG &DAG);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
-static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
+static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
+ SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
- const SelectionDAG &DAG) {
+ SelectionDAG &DAG) {
Mask.clear();
Ops.clear();
@@ -6483,8 +6652,9 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
unsigned NumElts = VT.getVectorNumElements();
unsigned NumSizeInBits = VT.getSizeInBits();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
- assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
- "Expected byte aligned value types");
+ if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
+ return false;
+ assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
unsigned Opcode = N.getOpcode();
switch (Opcode) {
@@ -6524,6 +6694,40 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return true;
}
case ISD::OR: {
+ // Inspect each operand at the byte level. We can merge these into a
+ // blend shuffle mask if, for each byte, at least one of the operands is
+ // masked out (zero).
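+ // e.g. OR(AND(X, 0x00FF00FF), AND(Y, 0xFF00FF00)) on v4i32 becomes a blend
+ // taking bytes 0 and 2 of each element from the first operand and bytes 1
+ // and 3 from the second.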
+ KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts);
+ KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts);
+ if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
+ bool IsByteMask = true;
+ unsigned NumSizeInBytes = NumSizeInBits / 8;
+ unsigned NumBytesPerElt = NumBitsPerElt / 8;
+ APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
+ APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
+ for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
+ unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
+ unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
+ if (LHS == 255 && RHS == 0)
+ SelectMask.setBit(i);
+ else if (LHS == 255 && RHS == 255)
+ ZeroMask.setBit(i);
+ else if (!(LHS == 0 && RHS == 255))
+ IsByteMask = false;
+ }
+ if (IsByteMask) {
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
+ for (unsigned j = 0; j != NumBytesPerElt; ++j) {
+ unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
+ int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
+ Mask.push_back(Idx);
+ }
+ }
+ Ops.push_back(N.getOperand(0));
+ Ops.push_back(N.getOperand(1));
+ return true;
+ }
+ }
+
// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
// is a valid shuffle index.
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
@@ -6558,9 +6762,6 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return true;
}
case ISD::INSERT_SUBVECTOR: {
- // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1)) where
- // SRC0/SRC1 are both of the same valuetype VT.
- // TODO - add peekThroughOneUseBitcasts support.
SDValue Src = N.getOperand(0);
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
@@ -6568,28 +6769,57 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
if (!isa<ConstantSDNode>(N.getOperand(2)) ||
!N->isOnlyUserOf(Sub.getNode()))
return false;
+ uint64_t InsertIdx = N.getConstantOperandVal(2);
+ // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
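+ // e.g. for v8i32, inserting extract_subvector(SRC1, 0) at index 4 of SRC0
+ // yields the mask { 0, 1, 2, 3, 8, 9, 10, 11 } over { SRC0, SRC1 }.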
+ if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Sub.getOperand(0).getValueType() == VT &&
+ isa<ConstantSDNode>(Sub.getOperand(1))) {
+ uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
+ for (int i = 0; i != (int)NumElts; ++i)
+ Mask.push_back(i);
+ for (int i = 0; i != (int)NumSubElts; ++i)
+ Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
+ Ops.push_back(Src);
+ Ops.push_back(Sub.getOperand(0));
+ return true;
+ }
+ // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
- if (!resolveTargetShuffleInputs(Sub, SubInputs, SubMask, DAG) ||
- SubMask.size() != NumSubElts)
+ if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
+ SubMask, DAG))
return false;
+ if (SubMask.size() != NumSubElts) {
+ assert(((SubMask.size() % NumSubElts) == 0 ||
+ (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
+ if ((NumSubElts % SubMask.size()) == 0) {
+ int Scale = NumSubElts / SubMask.size();
+ SmallVector<int,64> ScaledSubMask;
+ scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
+ SubMask = ScaledSubMask;
+ } else {
+ int Scale = SubMask.size() / NumSubElts;
+ NumSubElts = SubMask.size();
+ NumElts *= Scale;
+ InsertIdx *= Scale;
+ }
+ }
Ops.push_back(Src);
for (SDValue &SubInput : SubInputs) {
- if (SubInput.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
- SubInput.getOperand(0).getValueType() != VT ||
- !isa<ConstantSDNode>(SubInput.getOperand(1)))
- return false;
- Ops.push_back(SubInput.getOperand(0));
+ EVT SubSVT = SubInput.getValueType().getScalarType();
+ EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
+ NumSizeInBits / SubSVT.getSizeInBits());
+ Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
+ DAG.getUNDEF(AltVT), SubInput,
+ DAG.getIntPtrConstant(0, SDLoc(N))));
}
- int InsertIdx = N.getConstantOperandVal(2);
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
if (0 <= M) {
int InputIdx = M / NumSubElts;
- int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1);
- M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts);
+ M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
}
Mask[i + InsertIdx] = M;
}
@@ -6674,16 +6904,21 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type");
+ APInt EltsLHS, EltsRHS;
+ getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
+
// If we know input saturation won't happen we can treat this
// as a truncation shuffle.
if (Opcode == X86ISD::PACKSS) {
- if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
- (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
+ if ((!N0.isUndef() &&
+ DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) ||
+ (!N1.isUndef() &&
+ DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt))
return false;
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
- if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
- (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
+ if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) ||
+ (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS)))
return false;
}
@@ -6728,15 +6963,54 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
}
return true;
}
- case ISD::ZERO_EXTEND_VECTOR_INREG:
- case ISD::ZERO_EXTEND: {
- // TODO - add support for VPMOVZX with smaller input vector types.
+ case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
- if (NumSizeInBits != SrcVT.getSizeInBits())
- break;
- DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
+ if (!SrcVT.isVector())
+ return false;
+
+ if (NumSizeInBits != SrcVT.getSizeInBits()) {
+ assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
+ "Illegal broadcast type");
+ SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ NumSizeInBits / SrcVT.getScalarSizeInBits());
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
+ DAG.getUNDEF(SrcVT), Src,
+ DAG.getIntPtrConstant(0, SDLoc(N)));
+ }
+
+ Ops.push_back(Src);
+ Mask.append(NumElts, 0);
+ return true;
+ }
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::ANY_EXTEND_VECTOR_INREG: {
+ SDValue Src = N.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // Extended source must be a simple vector.
+ if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
+ (SrcVT.getScalarSizeInBits() % 8) != 0)
+ return false;
+
+ unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
+ bool IsAnyExtend =
+ (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
+ DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
Mask);
+
+ if (NumSizeInBits != SrcVT.getSizeInBits()) {
+ assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
+ "Illegal zero-extension type");
+ SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
+ NumSizeInBits / NumSrcBitsPerElt);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
+ DAG.getUNDEF(SrcVT), Src,
+ DAG.getIntPtrConstant(0, SDLoc(N)));
+ }
+
Ops.push_back(Src);
return true;
}
@@ -6745,7 +7019,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return false;
}
-/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
+/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask) {
int MaskWidth = Mask.size();
@@ -6761,13 +7035,28 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
M = SM_SentinelUndef;
// Check for unused inputs.
- if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
- UsedInputs.push_back(Inputs[i]);
+ if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+ for (int &M : Mask)
+ if (lo <= M)
+ M -= MaskWidth;
continue;
}
- for (int &M : Mask)
- if (lo <= M)
- M -= MaskWidth;
+
+ // Check for repeated inputs.
+ bool IsRepeat = false;
+ for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
+ if (UsedInputs[j] != Inputs[i])
+ continue;
+ for (int &M : Mask)
+ if (lo <= M)
+ M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
+ IsRepeat = true;
+ break;
+ }
+ if (IsRepeat)
+ continue;
+
+ UsedInputs.push_back(Inputs[i]);
}
Inputs = UsedInputs;
}
@@ -6780,9 +7069,11 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- const SelectionDAG &DAG) {
+ SelectionDAG &DAG) {
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(NumElts);
if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
- if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
+ if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG))
return false;
resolveTargetShuffleInputsAndMask(Inputs, Mask);
@@ -6838,6 +7129,28 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
Depth+1);
}
+ // Recurse into insert_subvector base/sub vector to find scalars.
+ if (Opcode == ISD::INSERT_SUBVECTOR &&
+ isa<ConstantSDNode>(N->getOperand(2))) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Sub = N->getOperand(1);
+ EVT SubVT = Sub.getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ uint64_t SubIdx = N->getConstantOperandVal(2);
+
+ if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
+ return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
+ return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
+ }
+
+ // Recurse into extract_subvector src vector to find scalars.
+ if (Opcode == ISD::EXTRACT_SUBVECTOR &&
+ isa<ConstantSDNode>(N->getOperand(1))) {
+ SDValue Src = N->getOperand(0);
+ uint64_t SrcIdx = N->getConstantOperandVal(1);
+ return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
+ }
+
// Actual nodes that may contain scalar elements
if (Opcode == ISD::BITCAST) {
V = V.getOperand(0);
@@ -6880,7 +7193,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
// If the build vector contains zeros or our first insertion is not the
// first index then insert into zero vector to break any register
- // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
+ // dependency, else use SCALAR_TO_VECTOR.
if (First) {
First = false;
if (NumZero || 0 != i)
@@ -6889,7 +7202,6 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
assert(0 == i && "Expected insertion into zero-index");
V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
- V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
V = DAG.getBitcast(VT, V);
continue;
}
@@ -6916,50 +7228,51 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
SDLoc dl(Op);
SDValue V;
- bool First = true;
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
- for (unsigned i = 0; i < 16; ++i) {
+ for (unsigned i = 0; i < 16; i += 2) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
- if (ThisIsNonZero && First) {
- if (NumZero)
- V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
+ if (!ThisIsNonZero && !NextIsNonZero)
+ continue;
+
+ // FIXME: Investigate combining the first 4 bytes as an i32 instead.
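+ // Combine bytes i and i+1 into an i32 as (Op[i] | (Op[i+1] << 8)) and
+ // insert the truncated i16 result at element i/2 of a v8i16 vector.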
+ SDValue Elt;
+ if (ThisIsNonZero) {
+ if (NumZero || NextIsNonZero)
+ Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
else
- V = DAG.getUNDEF(MVT::v8i16);
- First = false;
+ Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
}
- if ((i & 1) != 0) {
- // FIXME: Investigate extending to i32 instead of just i16.
- // FIXME: Investigate combining the first 4 bytes as a i32 instead.
- SDValue ThisElt, LastElt;
- bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
- if (LastIsNonZero) {
- LastElt =
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
- }
- if (ThisIsNonZero) {
- ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
- ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
- DAG.getConstant(8, dl, MVT::i8));
- if (LastIsNonZero)
- ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
- } else
- ThisElt = LastElt;
-
- if (ThisElt) {
- if (1 == i) {
- V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
- : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
- V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
- V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
- V = DAG.getBitcast(MVT::v8i16, V);
- } else {
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
- DAG.getIntPtrConstant(i / 2, dl));
- }
+ if (NextIsNonZero) {
+ SDValue NextElt = Op.getOperand(i + 1);
+ if (i == 0 && NumZero)
+ NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
+ else
+ NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
+ NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
+ DAG.getConstant(8, dl, MVT::i8));
+ if (ThisIsNonZero)
+ Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
+ else
+ Elt = NextElt;
+ }
+
+ // If our first insertion is not the first index then insert into zero
+ // vector to break any register dependency, else use SCALAR_TO_VECTOR.
+ if (!V) {
+ if (i != 0)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else {
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ continue;
}
}
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
+ DAG.getIntPtrConstant(i / 2, dl));
}
return DAG.getBitcast(MVT::v16i8, V);
@@ -7002,9 +7315,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
}
// Find all zeroable elements.
- std::bitset<4> Zeroable;
- for (int i=0; i < 4; ++i) {
- SDValue Elt = Op->getOperand(i);
+ std::bitset<4> Zeroable, Undefs;
+ for (int i = 0; i < 4; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ Undefs[i] = Elt.isUndef();
Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
@@ -7014,10 +7328,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
// zeroable or extract_vector_elt with constant index.
SDValue FirstNonZero;
unsigned FirstNonZeroIdx;
- for (unsigned i=0; i < 4; ++i) {
+ for (unsigned i = 0; i < 4; ++i) {
if (Zeroable[i])
continue;
- SDValue Elt = Op->getOperand(i);
+ SDValue Elt = Op.getOperand(i);
if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Elt.getOperand(1)))
return SDValue();
@@ -7056,10 +7370,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
if (EltIdx == 4) {
// Let the shuffle legalizer deal with blend operations.
- SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+ SDValue VZeroOrUndef = (Zeroable == Undefs)
+ ? DAG.getUNDEF(VT)
+ : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
if (V1.getSimpleValueType() != VT)
V1 = DAG.getBitcast(VT, V1);
- return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
+ return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
}
// See if we can lower this build_vector to a INSERTPS.
@@ -7079,7 +7395,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
- CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
+ CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
}
if (!CanFold)
@@ -7200,9 +7516,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
- SmallBitVector LoadMask(NumElems, false);
- SmallBitVector ZeroMask(NumElems, false);
- SmallBitVector UndefMask(NumElems, false);
+ APInt LoadMask = APInt::getNullValue(NumElems);
+ APInt ZeroMask = APInt::getNullValue(NumElems);
+ APInt UndefMask = APInt::getNullValue(NumElems);
+
+ SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
@@ -7210,38 +7528,52 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (!Elt.getNode())
return SDValue();
+ if (Elt.isUndef()) {
+ UndefMask.setBit(i);
+ continue;
+ }
+ if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
+ ZeroMask.setBit(i);
+ continue;
+ }
+
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
+ return SDValue();
- if (Elt.isUndef())
- UndefMask[i] = true;
- else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
- ZeroMask[i] = true;
- else if (ISD::isNON_EXTLoad(Elt.getNode())) {
- LoadMask[i] = true;
- LastLoadedElt = i;
- // Each loaded element must be the correct fractional portion of the
- // requested vector load.
- if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
- return SDValue();
- } else
+ if (!ISD::isNON_EXTLoad(Elt.getNode()))
return SDValue();
+
+ Loads[i] = cast<LoadSDNode>(Elt);
+ LoadMask.setBit(i);
+ LastLoadedElt = i;
}
- assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
+ assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
+ LoadMask.countPopulation()) == NumElems &&
"Incomplete element masks");
// Handle Special Cases - all undef or undef/zero.
- if (UndefMask.count() == NumElems)
+ if (UndefMask.countPopulation() == NumElems)
return DAG.getUNDEF(VT);
// FIXME: Should we return this as a BUILD_VECTOR instead?
- if ((ZeroMask | UndefMask).count() == NumElems)
+ if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- int FirstLoadedElt = LoadMask.find_first();
+ int FirstLoadedElt = LoadMask.countTrailingZeros();
SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
- LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
- EVT LDBaseVT = EltBase.getValueType();
+ EVT EltBaseVT = EltBase.getValueType();
+ assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
+ "Register/Memory size mismatch");
+ LoadSDNode *LDBase = Loads[FirstLoadedElt];
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
+ unsigned BaseSizeInBytes = BaseSizeInBits / 8;
+ int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
+ assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
// Consecutive loads can contain UNDEFS but not ZERO elements.
// Consecutive loads with UNDEFs and ZEROs elements require a
@@ -7250,11 +7582,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
- SDValue Elt = peekThroughBitcasts(Elts[i]);
- LoadSDNode *LD = cast<LoadSDNode>(Elt);
- if (!DAG.areNonVolatileConsecutiveLoads(
- LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
- i - FirstLoadedElt)) {
+ if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,
+ i - FirstLoadedElt)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
@@ -7264,11 +7593,6 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
- SmallVector<LoadSDNode *, 8> Loads;
- for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
- if (LoadMask[i])
- Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
-
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
@@ -7277,23 +7601,23 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
for (auto *LD : Loads)
- DAG.makeEquivalentMemoryOrdering(LD, NewLd);
+ if (LD)
+ DAG.makeEquivalentMemoryOrdering(LD, NewLd);
return NewLd;
};
- // LOAD - all consecutive load/undefs (must start/end with a load).
- // If we have found an entire vector of loads and undefs, then return a large
- // load of the entire vector width starting at the base pointer.
- // If the vector contains zeros, then attempt to shuffle those elements.
- if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
+ // Check if the base load is entirely dereferenceable.
+ bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
+ VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
+
+ // LOAD - all consecutive load/undefs (must start/end with a load or be
+ // entirely dereferenceable). If we have found an entire vector of loads and
+ // undefs, then return a large load of the entire vector width starting at the
+ // base pointer. If the vector contains zeros, then attempt to shuffle those
+ // elements.
+ if (FirstLoadedElt == 0 &&
+ (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
- assert(LDBase && "Did not find base load for merging consecutive loads");
- EVT EltVT = LDBase->getValueType(0);
- // Ensure that the input vector size for the merged loads matches the
- // cumulative size of the input elements.
- if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
- return SDValue();
-
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
@@ -7303,12 +7627,15 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();
- if (IsConsecutiveLoad)
+ if (NumElems == 1)
+ return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
+
+ if (!ZeroMask)
return CreateLoad(VT, LDBase);
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
- if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
+ if (!isAfterLegalize && VT.isVector()) {
SmallVector<int, 4> ClearMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (ZeroMask[i])
@@ -7323,16 +7650,28 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
- int LoadSize =
- (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
+ // If the upper half of a ymm/zmm load is undef then just load the lower half.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ unsigned HalfNumElems = NumElems / 2;
+ if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
+ EVT HalfVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
+ SDValue HalfLD =
+ EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
+ DAG, Subtarget, isAfterLegalize);
+ if (HalfLD)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
+ HalfLD, DAG.getIntPtrConstant(0, DL));
+ }
+ }
// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
- (LoadSize == 32 || LoadSize == 64) &&
+ (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
- MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
- : MVT::getIntegerVT(LoadSize);
- MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
+ : MVT::getIntegerVT(LoadSizeInBits);
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
@@ -7342,14 +7681,85 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
LDBase->getAlignment(),
MachineMemOperand::MOLoad);
for (auto *LD : Loads)
- DAG.makeEquivalentMemoryOrdering(LD, ResNode);
+ if (LD)
+ DAG.makeEquivalentMemoryOrdering(LD, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
+ // BROADCAST - match the smallest possible repetition pattern, load that
+ // scalar/subvector element and then broadcast to the entire vector.
+ if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
+ (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
+ for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
+ unsigned RepeatSize = SubElems * BaseSizeInBits;
+ unsigned ScalarSize = std::min(RepeatSize, 64u);
+ if (!Subtarget.hasAVX2() && ScalarSize < 32)
+ continue;
+
+ bool Match = true;
+ SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
+ for (unsigned i = 0; i != NumElems && Match; ++i) {
+ if (!LoadMask[i])
+ continue;
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ if (RepeatedLoads[i % SubElems].isUndef())
+ RepeatedLoads[i % SubElems] = Elt;
+ else
+ Match &= (RepeatedLoads[i % SubElems] == Elt);
+ }
+
+ // We must have loads at both ends of the repetition.
+ Match &= !RepeatedLoads.front().isUndef();
+ Match &= !RepeatedLoads.back().isUndef();
+ if (!Match)
+ continue;
+
+ EVT RepeatVT =
+ VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
+ ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
+ : EVT::getFloatingPointVT(ScalarSize);
+ if (RepeatSize > ScalarSize)
+ RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
+ RepeatSize / ScalarSize);
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
+ VT.getSizeInBits() / ScalarSize);
+ if (TLI.isTypeLegal(BroadcastVT)) {
+ if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
+ RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
+ unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
+ : X86ISD::VBROADCAST;
+ SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
+ return DAG.getBitcast(VT, Broadcast);
+ }
+ }
+ }
+ }
+
return SDValue();
}
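
As a rough, self-contained illustration of the repetition search in the BROADCAST block above (plain integers stand in for the SDValue elements; none of this is part of the patch), the doubling loop accepts the smallest SubElems for which every defined element agrees with its slot modulo SubElems and both ends of the repeat unit are defined:

#include <cstddef>
#include <vector>

// -1 stands in for an undef/unloaded element.
static bool matchesRepeat(const std::vector<int> &Elts, std::size_t SubElems) {
  std::vector<int> Repeated(SubElems, -1);
  for (std::size_t i = 0; i != Elts.size(); ++i) {
    if (Elts[i] < 0)
      continue;                        // undef imposes no constraint
    int &Slot = Repeated[i % SubElems];
    if (Slot < 0)
      Slot = Elts[i];                  // first definition of this repeat slot
    else if (Slot != Elts[i])
      return false;                    // conflicting definitions: no repeat
  }
  // Mirror the RepeatedLoads.front()/back() checks: both ends must be defined.
  return Repeated.front() >= 0 && Repeated.back() >= 0;
}

// For Elts = {7, 9, 7, 9, -1, 9, 7, 9}: matchesRepeat(Elts, 1) is false and
// matchesRepeat(Elts, 2) is true, so a 2-element repeat unit would be loaded
// and broadcast.
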
+// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
+// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
+// are consecutive, non-overlapping, and in the right order.
+static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ bool isAfterLegalize) {
+ SmallVector<SDValue, 64> Elts;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+ Elts.push_back(Elt);
+ continue;
+ }
+ return SDValue();
+ }
+ assert(Elts.size() == VT.getVectorNumElements());
+ return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
+ isAfterLegalize);
+}
+
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
@@ -7373,12 +7783,20 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
-static bool isUseOfShuffle(SDNode *N) {
+static bool isFoldableUseOfShuffle(SDNode *N) {
for (auto *U : N->uses()) {
- if (isTargetShuffle(U->getOpcode()))
+ unsigned Opc = U->getOpcode();
+ // VPERMV/VPERMV3 shuffles can never fold their index operands.
+ if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
+ return false;
+ if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
+ return false;
+ if (isTargetShuffle(Opc))
+ return true;
+ if (Opc == ISD::BITCAST) // Ignore bitcasts
+ return isFoldableUseOfShuffle(U);
+ if (N->hasOneUse())
return true;
- if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
- return isUseOfShuffle(U);
}
return false;
}
@@ -7486,7 +7904,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with broadcast when it's a use of a shuffle
// instruction to preserve the present custom lowering of shuffles.
- if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
+ if (isFoldableUseOfShuffle(BVOp))
return SDValue();
// replace BUILD_VECTOR with broadcast of the repeated constants.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -7581,7 +7999,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
@@ -8330,6 +8748,22 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
else if (V1.getValueSizeInBits() < Width)
V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
+ unsigned NumElts = VT.getVectorNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (BV->getOperand(i).isUndef())
+ DemandedElts.clearBit(i);
+
+ // If we don't need the upper xmm, then perform as a xmm hop.
+ unsigned HalfNumElts = NumElts / 2;
+ if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
+ MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts);
+ V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
+ V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
+ SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
+ return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
+ }
+
return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
}
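
A hedged example of the new half-width path above: for a hypothetical v8i32 horizontal op whose upper four build_vector operands are undef, the demanded-elements mask reduces to the low half, so the hop can be done on the 128-bit halves and widened with undef.

// Sketch with a plain bitmask instead of APInt (assumes 8 lanes, low 4 defined).
static bool halfHopApplies() {
  unsigned DemandedElts = 0;
  for (unsigned i = 0; i != 8; ++i)
    if (i < 4)                       // assumption: only the low 4 lanes are defined
      DemandedElts |= 1u << i;
  return (DemandedElts >> 4) == 0;   // true here, matching the check above
}
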
@@ -8338,11 +8772,8 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We need at least 2 non-undef elements to make this worthwhile by default.
- unsigned NumNonUndefs = 0;
- for (const SDValue &V : BV->op_values())
- if (!V.isUndef())
- ++NumNonUndefs;
-
+ unsigned NumNonUndefs =
+ count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
if (NumNonUndefs < 2)
return SDValue();
@@ -8350,23 +8781,15 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
// int/FP at 128-bit/256-bit. Each type was introduced with a different
// subtarget feature. Try to match those "native" patterns first.
MVT VT = BV->getSimpleValueType(0);
- unsigned HOpcode;
- SDValue V0, V1;
- if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3())
- if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
- return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
-
- if ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3())
- if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
- return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
-
- if ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX())
- if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
- return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
-
- if ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())
+ if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
+ ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
+ ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
+ ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
+ unsigned HOpcode;
+ SDValue V0, V1;
if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
+ }
// Try harder to match 256-bit ops by using extract/concat.
if (!Subtarget.hasAVX() || !VT.is256BitVector())
@@ -8481,9 +8904,15 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
return SDValue();
// TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
+ bool IsShift = false;
switch (Opcode) {
default:
return SDValue();
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ IsShift = true;
+ break;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
@@ -8504,10 +8933,24 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
// We expect the canonicalized RHS operand to be the constant.
if (!isa<ConstantSDNode>(RHS))
return SDValue();
+
+ // Extend shift amounts.
+ if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
+ if (!IsShift)
+ return SDValue();
+ RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
+ }
+
LHSElts.push_back(LHS);
RHSElts.push_back(RHS);
}
+ // Limit to shifts by uniform immediates.
+ // TODO: Only accept vXi8/vXi64 special cases?
+ // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
+ if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
+ return SDValue();
+
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
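
To make the new shift handling concrete, this is the shape of the rewrite the path above now performs (operand names are illustrative only, not taken from the patch):

// Before:  build_vector (shl a, 3), (shl b, 3), (shl c, 3), (shl d, 3) : v4i32
// After:   shl (build_vector a, b, c, d), (build_vector 3, 3, 3, 3)    : v4i32
// Mixed amounts, e.g. (shl a, 1), (shl b, 2), ..., are rejected by the
// uniform-immediate check above and the build_vector is left alone.
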
@@ -9288,60 +9731,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
return Vec;
}
-// Return true if all the operands of the given CONCAT_VECTORS node are zeros
-// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
-static bool isExpandWithZeros(const SDValue &Op) {
- assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
- "Expand with zeros only possible in CONCAT_VECTORS nodes!");
-
- for (unsigned i = 1; i < Op.getNumOperands(); i++)
- if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
- return false;
-
- return true;
-}
-
// Returns true if the given node is a type promotion (by concatenating i1
// zeros) of the result of a node that already zeros all upper bits of
// k-register.
-static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
- unsigned Opc = Op.getOpcode();
-
- assert(Opc == ISD::CONCAT_VECTORS &&
- Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
- "Unexpected node to check for type promotion!");
-
- // As long as we are concatenating zeros to the upper part of a previous node
- // result, climb up the tree until a node with different opcode is
- // encountered
- while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
- if (Opc == ISD::INSERT_SUBVECTOR) {
- if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
- Op.getConstantOperandVal(2) == 0)
- Op = Op.getOperand(1);
- else
- return SDValue();
- } else { // Opc == ISD::CONCAT_VECTORS
- if (isExpandWithZeros(Op))
- Op = Op.getOperand(0);
- else
- return SDValue();
- }
- Opc = Op.getOpcode();
- }
-
- // Check if the first inserted node zeroes the upper bits, or an 'and' result
- // of a node that zeros the upper bits (its masked version).
- if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
- (Op.getOpcode() == ISD::AND &&
- (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
- isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
- return Op;
- }
-
- return SDValue();
-}
-
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
@@ -9353,13 +9745,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
- // If this node promotes - by concatenating zeroes - the type of the result
- // of a node with instruction that zeroes all upper (irrelevant) bits of the
- // output register, mark it as legal and catch the pattern in instruction
- // selection to avoid emitting extra instructions (for zeroing upper bits).
- if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
- return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
-
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
@@ -9618,6 +10003,8 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
return false;
+ assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
+ "Illegal target shuffle mask");
for (int i = 0; i < Size; ++i)
if (Mask[i] == SM_SentinelUndef)
@@ -9687,6 +10074,40 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
return IsUnpackwdMask;
}
+static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
+ // Create 128-bit vector type based on mask size.
+ MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
+ MVT VT = MVT::getVectorVT(EltVT, Mask.size());
+
+ // We can't assume a canonical shuffle mask, so try the commuted version too.
+ SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+
+ // Match any of unary/binary or low/high.
+ for (unsigned i = 0; i != 4; ++i) {
+ SmallVector<int, 16> UnpackMask;
+ createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
+ if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
+ isTargetShuffleEquivalent(CommutedMask, UnpackMask))
+ return true;
+ }
+ return false;
+}
+
+/// Return true if a shuffle mask chooses elements identically in its top and
+/// bottom halves. For example, any splat mask has the same top and bottom
+/// halves. If an element is undefined in only one half of the mask, the halves
+/// are not considered identical.
+static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
+ assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
+ unsigned HalfSize = Mask.size() / 2;
+ for (unsigned i = 0; i != HalfSize; ++i) {
+ if (Mask[i] != Mask[i + HalfSize])
+ return false;
+ }
+ return true;
+}
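
Two illustrative masks for the helper above:

//   {2, 2, 2, 2,  2, 2, 2, 2}  -> true  (a splat; halves match element-for-element)
//   {0, 1, 2, 3,  0, 1, 2, -1} -> false (undef appears in only one half)
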
+
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -9826,12 +10247,11 @@ static bool isNonZeroElementsInOrder(const APInt &Zeroable,
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
-static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
const int NumBytes = VT.getSizeInBits() / 8;
@@ -9885,11 +10305,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const SDLoc &dl);
// X86 has dedicated shuffle that can be lowered to VEXPAND
-static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
- const APInt &Zeroable,
- ArrayRef<int> Mask, SDValue &V1,
- SDValue &V2, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
+ const APInt &Zeroable,
+ ArrayRef<int> Mask, SDValue &V1,
+ SDValue &V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
bool IsLeftZeroSide = true;
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
@@ -9905,9 +10325,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
- return DAG.getSelect(DL, VT, VMask,
- DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
- ZeroVector);
+ return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
@@ -9997,9 +10415,9 @@ static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
-static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
SmallVector<int, 8> Unpckl;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
@@ -10061,10 +10479,10 @@ static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
-static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
if (VT != MVT::v16i8 && VT != MVT::v8i16)
return SDValue();
@@ -10169,10 +10587,9 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
return false;
}
-static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
@@ -10187,14 +10604,32 @@ static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
-static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SelectionDAG &DAG) {
- assert(!VT.isFloatingPoint() && "Floating point types are not supported");
+static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT MaskVT = VT;
MVT EltVT = VT.getVectorElementType();
- SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
+ SDValue Zero, AllOnes;
+ // Use f64 if i64 isn't legal.
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ MaskVT = MVT::getVectorVT(EltVT, Mask.size());
+ }
+
+ MVT LogicVT = VT;
+ if (EltVT == MVT::f32 || EltVT == MVT::f64) {
+ Zero = DAG.getConstantFP(0.0, DL, EltVT);
+ AllOnes = DAG.getConstantFP(
+ APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
+ LogicVT =
+ MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
+ } else {
+ Zero = DAG.getConstant(0, DL, EltVT);
+ AllOnes = DAG.getAllOnesConstant(DL, EltVT);
+ }
+
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
@@ -10212,8 +10647,11 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
if (!V)
return SDValue(); // No non-zeroable elements!
- SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
- return DAG.getNode(ISD::AND, DL, VT, V, VMask);
+ SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
+ VMask = DAG.getBitcast(LogicVT, VMask);
+ V = DAG.getBitcast(LogicVT, V);
+ SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
+ return DAG.getBitcast(VT, And);
}
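
A small worked example of the bitmask lowering above (mask values are illustrative):

// v4i32 shuffle of V1 with mask <0, Z, 2, Z> (lanes 1 and 3 zeroable):
//   VMask  = build_vector(-1, 0, -1, 0)
//   result = and(V1, VMask)
// When EltVT is i64 on a 32-bit target, the new code above builds the mask
// constants as f64 elements and bitcasts them for the integer AND instead.
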
/// Try to emit a blend instruction for a shuffle using bit math.
@@ -10221,9 +10659,9 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
-static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
@@ -10305,11 +10743,11 @@ static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Original,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Original,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
uint64_t BlendMask = 0;
@@ -10325,45 +10763,24 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
V2 = getZeroVector(VT, Subtarget, DAG, DL);
switch (VT.SimpleTy) {
- case MVT::v2f64:
- case MVT::v4f32:
- case MVT::v4f64:
- case MVT::v8f32:
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
+ case MVT::v4f64:
+ case MVT::v8f32:
+ assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
+ LLVM_FALLTHROUGH;
+ case MVT::v2f64:
case MVT::v2i64:
+ case MVT::v4f32:
case MVT::v4i32:
- // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
- // that instruction.
- if (Subtarget.hasAVX2()) {
- // Scale the blend by the number of 32-bit dwords per element.
- int Scale = VT.getScalarSizeInBits() / 32;
- BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
- MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
- V1 = DAG.getBitcast(BlendVT, V1);
- V2 = DAG.getBitcast(BlendVT, V2);
- return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8)));
- }
- LLVM_FALLTHROUGH;
- case MVT::v8i16: {
- // For integer shuffles we need to expand the mask and cast the inputs to
- // v8i16s prior to blending.
- int Scale = 8 / VT.getVectorNumElements();
- BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
- V1 = DAG.getBitcast(MVT::v8i16, V1);
- V2 = DAG.getBitcast(MVT::v8i16, V2);
- return DAG.getBitcast(VT,
- DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8)));
- }
+ case MVT::v8i16:
+ assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
- assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
+ assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
@@ -10391,14 +10808,15 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
}
LLVM_FALLTHROUGH;
}
- case MVT::v16i8:
- case MVT::v32i8: {
- assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
- "256-bit byte-blends require AVX2 support!");
+ case MVT::v32i8:
+ assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
+ LLVM_FALLTHROUGH;
+ case MVT::v16i8: {
+ assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
- if (SDValue Masked =
- lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return Masked;
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
@@ -10456,6 +10874,16 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
+ // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!OptForSize) {
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return Masked;
+ }
+
+ // Otherwise load an immediate into a GPR, cast to k-register, and use a
+ // masked move.
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
@@ -10471,11 +10899,11 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
-static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG,
- bool ImmBlends = false) {
+static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ bool ImmBlends = false) {
// We build up the blend mask while checking whether a blend is a viable way
// to reduce the shuffle.
SmallVector<int, 32> BlendMask(Mask.size(), -1);
@@ -10510,10 +10938,10 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
-static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
@@ -10573,7 +11001,7 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
-static SDValue lowerVectorShuffleAsByteRotateAndPermute(
+static SDValue lowerShuffleAsByteRotateAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
@@ -10664,7 +11092,7 @@ static SDValue lowerVectorShuffleAsByteRotateAndPermute(
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
-static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
+static SDValue lowerShuffleAsDecomposedShuffleBlend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
// Shuffle the input elements into the desired positions in V1 and V2 and
@@ -10688,18 +11116,18 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
// pre-shuffle first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
// Only prefer immediate blends to unpack/rotate.
- if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
- DL, VT, V1, V2, Mask, DAG, true))
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
+ DAG, true))
return BlendPerm;
- if (SDValue UnpackPerm =
- lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
+ DAG))
return UnpackPerm;
- if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute(
+ if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
DL, VT, V1, V2, Mask, Subtarget, DAG))
return RotatePerm;
// Unpack/rotate failed - try again with variable blends.
- if (SDValue BlendPerm =
- lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
+ DAG))
return BlendPerm;
}
@@ -10711,8 +11139,7 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
/// Try to lower a vector shuffle as a rotation.
///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
-static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask) {
+static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
@@ -10796,8 +11223,8 @@ static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask) {
+static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return -1;
@@ -10807,7 +11234,7 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
- int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
+ int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
@@ -10818,15 +11245,14 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
return Rotation * Scale;
}
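
A worked example for the byte-rotate matcher above:

// v8i16 mask <3, 4, 5, 6, 7, 8, 9, 10> takes elements 3..7 of V1 followed by
// elements 0..2 of V2, i.e. an element rotation of 3; with 2 bytes per v8i16
// element the matcher returns 3 * 2 = 6, which becomes the PALIGNR immediate.
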
-static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
SDValue Lo = V1, Hi = V2;
- int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
+ int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
if (ByteRotation <= 0)
return SDValue();
@@ -10874,11 +11300,10 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
"Only 32-bit and 64-bit elements are supported!");
@@ -10887,7 +11312,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
&& "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
- int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
+ int Rotation = matchShuffleAsRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
@@ -10895,6 +11320,69 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
DAG.getConstant(Rotation, DL, MVT::i8));
}
+/// Try to lower a vector shuffle as a byte shift sequence.
+static SDValue lowerVectorShuffleAsByteShiftMask(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ assert(VT.is128BitVector() && "Only 128-bit vectors supported");
+
+ // We need a shuffle that has zeros at one/both ends and a sequential
+ // shuffle from one source within.
+ unsigned ZeroLo = Zeroable.countTrailingOnes();
+ unsigned ZeroHi = Zeroable.countLeadingOnes();
+ if (!ZeroLo && !ZeroHi)
+ return SDValue();
+
+ unsigned NumElts = Mask.size();
+ unsigned Len = NumElts - (ZeroLo + ZeroHi);
+ if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
+ return SDValue();
+
+ unsigned Scale = VT.getScalarSizeInBits() / 8;
+ ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
+ if (!isUndefOrInRange(StubMask, 0, NumElts) &&
+ !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
+ return SDValue();
+
+ SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
+ Res = DAG.getBitcast(MVT::v16i8, Res);
+
+ // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
+ // inner sequential set of elements, possibly offset:
+ // 01234567 --> zzzzzz01 --> 1zzzzzzz
+ // 01234567 --> 4567zzzz --> zzzzz456
+ // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
+ if (ZeroLo == 0) {
+ unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * ZeroHi, DL, MVT::i8));
+ } else if (ZeroHi == 0) {
+ unsigned Shift = Mask[ZeroLo] % NumElts;
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+ } else if (!Subtarget.hasSSSE3()) {
+    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
+ // by performing 3 byte shifts. Shuffle combining can kick in above that.
+ // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
+ unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Shift += Mask[ZeroLo] % NumElts;
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+ } else
+ return SDValue();
+
+ return DAG.getBitcast(VT, Res);
+}
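
The VSHLDQ/VSRLDQ trick above can be illustrated with a small, self-contained byte model (plain arrays rather than SDValues; not part of the patch):

#include <array>
#include <cstdint>

using V16 = std::array<uint8_t, 16>;

static V16 shlBytes(const V16 &V, unsigned N) { // models X86ISD::VSHLDQ
  V16 R{};                                      // zero-filled
  for (unsigned i = N; i < 16; ++i)
    R[i] = V[i - N];
  return R;
}

static V16 srlBytes(const V16 &V, unsigned N) { // models X86ISD::VSRLDQ
  V16 R{};
  for (unsigned i = 0; i + N < 16; ++i)
    R[i] = V[i + N];
  return R;
}

// With V = {0, 1, 2, ..., 15}, srlBytes(shlBytes(V, 9), 9) keeps bytes 0..6 in
// place and zeroes bytes 7..15: the ends are cleared while an inner sequential
// run of the source survives, which is the pattern the lowering above builds.
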
+
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -10918,11 +11406,10 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
-static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
- unsigned ScalarSizeInBits,
- ArrayRef<int> Mask, int MaskOffset,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget) {
+static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+ unsigned ScalarSizeInBits, ArrayRef<int> Mask,
+ int MaskOffset, const APInt &Zeroable,
+ const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
@@ -10981,11 +11468,11 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
return -1;
}
-static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
int Size = Mask.size();
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
@@ -10994,14 +11481,13 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
unsigned Opcode;
// Try to match shuffle against V1 shift.
- int ShiftAmt = matchVectorShuffleAsShift(
- ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
+ int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, 0, Zeroable, Subtarget);
// If V1 failed, try to match shuffle against V2 shift.
if (ShiftAmt < 0) {
- ShiftAmt =
- matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
- Mask, Size, Zeroable, Subtarget);
+ ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, Size, Zeroable, Subtarget);
V = V2;
}
@@ -11018,16 +11504,16 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
-static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask, uint64_t &BitLen,
- uint64_t &BitIdx, const APInt &Zeroable) {
+static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
- if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ if (!isUndefUpperHalf(Mask))
return false;
// Determine the extraction length from the part of the
@@ -11074,15 +11560,15 @@ static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
-static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask, uint64_t &BitLen,
- uint64_t &BitIdx) {
+static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Upper half must be undefined.
- if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ if (!isUndefUpperHalf(Mask))
return false;
for (int Idx = 0; Idx != HalfSize; ++Idx) {
@@ -11140,17 +11626,16 @@ static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
}
/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
-static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable, SelectionDAG &DAG) {
uint64_t BitLen, BitIdx;
- if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
+ if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
- if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
+ if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
V2 ? V2 : DAG.getUNDEF(VT),
DAG.getConstant(BitLen, DL, MVT::i8),
@@ -11168,7 +11653,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
/// avoid excess shuffling the offset must either be in the bottom lane
/// or at the start of a higher lane. All extended elements must be from
/// the same lane.
-static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
@@ -11203,6 +11688,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
+ // TODO: Add AnyExt support.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
@@ -11211,7 +11697,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
- InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG);
+ InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
@@ -11234,7 +11720,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFWMask[4] = {1, -1, -1, -1};
- unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
+ unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
return DAG.getBitcast(
VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
@@ -11253,8 +11739,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
DAG.getConstant(EltBits, DL, MVT::i8),
DAG.getConstant(LoIdx, DL, MVT::i8)));
- if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
- !SafeOffset(Offset + 1))
+ if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
@@ -11326,7 +11811,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
-static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+static SDValue lowerShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11397,8 +11882,8 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
if (Offset != 0 && Matches < 2)
return SDValue();
- return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
+ return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
+ InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
@@ -11482,7 +11967,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
-static SDValue lowerVectorShuffleAsElementInsertion(
+static SDValue lowerShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11580,10 +12065,10 @@ static SDValue lowerVectorShuffleAsElementInsertion(
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
-static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
- SDValue V0, int BroadcastIdx,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
+ int BroadcastIdx,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
@@ -11629,16 +12114,90 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
+/// Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+ // This routine only handles 128-bit shufps.
+ assert(Mask.size() == 4 && "Unsupported mask size!");
+ assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
+ assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
+ assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
+ assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
+
+ // To lower with a single SHUFPS we need to have the low half and high half
+ // each requiring a single input.
+ if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
+ return false;
+ if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
+ return false;
+
+ return true;
+}
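
Two illustrative 4-element masks for the check above:

//   <0, 1, 4, 5> : single SHUFPS (the low half reads only V1, the high half only V2)
//   <0, 4, 1, 5> : not a single SHUFPS (each half mixes both inputs)
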
+
+/// If we are extracting two 128-bit halves of a vector and shuffling the
+/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
+/// multi-shuffle lowering.
+static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
+ SDValue N1, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ EVT VT = N0.getValueType();
+ assert((VT.is128BitVector() &&
+ (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
+ "VPERM* family of shuffles requires 32-bit or 64-bit elements");
+
+ // Check that both sources are extracts of the same source vector.
+ if (!N0.hasOneUse() || !N1.hasOneUse() ||
+ N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ N0.getOperand(0) != N1.getOperand(0))
+ return SDValue();
+
+ SDValue WideVec = N0.getOperand(0);
+ EVT WideVT = WideVec.getValueType();
+ if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
+ !isa<ConstantSDNode>(N1.getOperand(1)))
+ return SDValue();
+
+ // Match extracts of each half of the wide source vector. Commute the shuffle
+ // if the extract of the low half is N1.
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
+ const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
+ const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
+ if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
+ ShuffleVectorSDNode::commuteMask(NewMask);
+ else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
+ return SDValue();
+
+ // Final bailout: if the mask is simple, we are better off using an extract
+ // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
+ // because that avoids a constant load from memory.
+ if (NumElts == 4 &&
+ (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
+ return SDValue();
+
+ // Extend the shuffle mask with undef elements.
+ NewMask.append(NumElts, -1);
+
+ // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
+ SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
+ NewMask);
+ // This is free: ymm -> xmm.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
+ DAG.getIntPtrConstant(0, DL));
+}
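
A sketch of the transform above for a hypothetical 256-bit source X (DAG nodes shown schematically; no new APIs are implied):

// shuffle (extract_subvector X, 0), (extract_subvector X, 4), <1, 7, 2, 4>
//   --> extract_subvector (shuffle X, undef, <1, 7, 2, 4, u, u, u, u>), 0
// The mask <1, 7, 2, 4> is neither a single SHUFPS nor an unpack pattern, so
// it survives the final bailout and lowers to one VPERMPS plus a free
// ymm-to-xmm extract.
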
+
/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
-static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
(Subtarget.hasAVX() && VT.isFloatingPoint()) ||
(Subtarget.hasAVX2() && VT.isInteger())))
@@ -11647,6 +12206,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
unsigned NumElts = Mask.size();
+ unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: X86ISD::VBROADCAST;
@@ -11670,29 +12230,19 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
+ int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
case ISD::BITCAST: {
- // Peek through bitcasts as long as BroadcastIdx can be adjusted.
- SDValue VSrc = V.getOperand(0);
- unsigned NumEltBits = V.getScalarValueSizeInBits();
- unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
- if ((NumEltBits % NumSrcBits) == 0)
- BroadcastIdx *= (NumEltBits / NumSrcBits);
- else if ((NumSrcBits % NumEltBits) == 0 &&
- (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
- BroadcastIdx /= (NumSrcBits / NumEltBits);
- else
- break;
- V = VSrc;
+ V = V.getOperand(0);
continue;
}
case ISD::CONCAT_VECTORS: {
- int OperandSize =
- V.getOperand(0).getSimpleValueType().getVectorNumElements();
- V = V.getOperand(BroadcastIdx / OperandSize);
- BroadcastIdx %= OperandSize;
+ int OpBitWidth = V.getOperand(0).getValueSizeInBits();
+ int OpIdx = BitOffset / OpBitWidth;
+ V = V.getOperand(OpIdx);
+ BitOffset %= OpBitWidth;
continue;
}
case ISD::INSERT_SUBVECTOR: {
@@ -11701,11 +12251,13 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
if (!ConstantIdx)
break;
- int BeginIdx = (int)ConstantIdx->getZExtValue();
- int EndIdx =
- BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
- if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
- BroadcastIdx -= BeginIdx;
+ int EltBitWidth = VOuter.getScalarValueSizeInBits();
+ int Idx = (int)ConstantIdx->getZExtValue();
+ int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
+ int BeginOffset = Idx * EltBitWidth;
+ int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
+ if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
+ BitOffset -= BeginOffset;
V = VInner;
} else {
V = VOuter;
@@ -11715,48 +12267,34 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
}
break;
}
+ assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
+ BroadcastIdx = BitOffset / NumEltBits;
- // Ensure the source vector and BroadcastIdx are for a suitable type.
- if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
- unsigned NumEltBits = VT.getScalarSizeInBits();
- unsigned NumSrcBits = V.getScalarValueSizeInBits();
- if ((NumSrcBits % NumEltBits) == 0)
- BroadcastIdx *= (NumSrcBits / NumEltBits);
- else if ((NumEltBits % NumSrcBits) == 0 &&
- (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
- BroadcastIdx /= (NumEltBits / NumSrcBits);
- else
- return SDValue();
-
- unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
- MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
- V = DAG.getBitcast(SrcVT, V);
- }
+ // Do we need to bitcast the source to retrieve the original broadcast index?
+ bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
- // First, look through bitcast: if the original value has a larger element
- // type than the shuffle, the broadcast element is in essence truncated.
- // Make that explicit to ease folding.
- if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
- if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
- DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
+ // If the original value has a larger element type than the shuffle, the
+ // broadcast element is in essence truncated. Make that explicit to ease
+ // folding.
+ if (BitCastSrc && VT.isInteger())
+ if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
+ DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
MVT BroadcastVT = VT;
- // Peek through any bitcast (only useful for loads).
- SDValue BC = peekThroughBitcasts(V);
-
// Also check the simpler case, where we can directly reuse the scalar.
- if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
- (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+ if (!BitCastSrc &&
+ ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
+ (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
V = V.getOperand(BroadcastIdx);
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
- } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
+ } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
@@ -11767,10 +12305,11 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// If we are broadcasting a load that is only used by the shuffle
// then we can reduce the vector load to the broadcasted scalar load.
- LoadSDNode *Ld = cast<LoadSDNode>(BC);
+ LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
EVT SVT = BroadcastVT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+ assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
@@ -11779,7 +12318,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
- } else if (BroadcastIdx != 0) {
+ } else if (BitOffset != 0) {
// We can only broadcast from the zero-element of a vector register,
// but it can be advantageous to broadcast from the zero-element of a
// subvector.
@@ -11791,18 +12330,15 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
return SDValue();
// Only broadcast the zero-element of a 128-bit subvector.
- unsigned EltSize = VT.getScalarSizeInBits();
- if (((BroadcastIdx * EltSize) % 128) != 0)
+ if ((BitOffset % 128) != 0)
return SDValue();
- // The shuffle input might have been a bitcast we looked through; look at
- // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
- // later bitcast it to BroadcastVT.
- assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
- "Unexpected vector element size");
+ assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
+ "Unexpected bit-offset");
assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
"Unexpected vector size");
- V = extract128BitVector(V, BroadcastIdx, DAG, DL);
+ unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
+ V = extract128BitVector(V, ExtractIdx, DAG, DL);
}
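// Illustrative case for the subvector extraction just above (hypothetical
// v8f32 input): broadcasting element 4 means BitOffset = 128, which passes the
// (BitOffset % 128) == 0 check; ExtractIdx = 128 / 32 = 4, so the upper
// 128-bit half is extracted and the broadcast then reads its zero element.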
if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
@@ -11810,21 +12346,21 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
DAG.getBitcast(MVT::f64, V));
// Bitcast back to the same scalar type as BroadcastVT.
- MVT SrcVT = V.getSimpleValueType();
- if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
- assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
+ assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
"Unexpected vector element size");
- if (SrcVT.isVector()) {
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
- SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
+ MVT ExtVT;
+ if (V.getValueType().isVector()) {
+ unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
+ ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
} else {
- SrcVT = BroadcastVT.getScalarType();
+ ExtVT = BroadcastVT.getScalarType();
}
- V = DAG.getBitcast(SrcVT, V);
+ V = DAG.getBitcast(ExtVT, V);
}
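// Example of the bitcast back above, with hypothetical types: if V is a v4i32
// value feeding a v8i16 broadcast, NumSrcElts = 128 / 16 = 8 and ExtVT becomes
// v8i16, so the bitcast merely reinterprets the 128 bits without moving data.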
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
+ if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
V = DAG.getBitcast(MVT::f64, V);
unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
@@ -11833,9 +12369,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
- if (SrcVT.getSizeInBits() > 128) {
- MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
- 128 / SrcVT.getScalarSizeInBits());
+ if (V.getValueSizeInBits() > 128) {
+ MVT ExtVT = V.getSimpleValueType().getScalarType();
+ ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
V = DAG.getBitcast(ExtVT, V);
}
@@ -11849,11 +12385,10 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
-static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
- unsigned &InsertPSMask,
- const APInt &Zeroable,
- ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+ unsigned &InsertPSMask,
+ const APInt &Zeroable,
+ ArrayRef<int> Mask, SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -11938,16 +12473,15 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
return false;
}
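For reference, the InsertPSMask computed by matchShuffleAsInsertPS is consumed as the INSERTPS immediate by the lowering below. A minimal sketch of how such an immediate is assembled, assuming the standard SSE4.1 layout (source element in bits 7:6, destination element in bits 5:4, zero mask in bits 3:0); makeInsertPSImm is an illustrative name, not an LLVM helper:

static unsigned makeInsertPSImm(unsigned SrcElt, unsigned DstElt, unsigned ZMask) {
  // imm8[7:6] = source element, imm8[5:4] = destination element,
  // imm8[3:0] = destination elements to force to zero.
  return ((SrcElt & 0x3) << 6) | ((DstElt & 0x3) << 4) | (ZMask & 0xF);
}

For example, makeInsertPSImm(2, 1, 0) yields 0x90: copy source element 2 into destination element 1 and zero nothing.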
-static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask, const APInt &Zeroable,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
// Attempt to match the insertps pattern.
unsigned InsertPSMask;
- if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+ if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
return SDValue();
// Insert the V2 element into the desired position.
@@ -11964,7 +12498,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
-static SDValue lowerVectorShuffleAsPermuteAndUnpack(
+static SDValue lowerShuffleAsPermuteAndUnpack(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
@@ -12079,19 +12613,18 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
-static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. Simulate this by using the
@@ -12116,16 +12649,20 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
@@ -12141,13 +12678,12 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
if (Subtarget.hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
return V;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
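// Worked example of the immediate computed above: bit 0 selects the V1 element
// for result lane 0 and bit 1 selects the V2 element for result lane 1, so a
// mask of <1, 3> gives SHUFPDMask = 1 | (1 << 1) = 3, <0, 2> gives 0, and
// <1, 2> gives 1.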
@@ -12161,19 +12697,18 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
-static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
@@ -12193,20 +12728,24 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
@@ -12214,33 +12753,32 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
}
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG);
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
@@ -12252,36 +12790,14 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
-/// Test whether this can be lowered with a single SHUFPS instruction.
-///
-/// This is used to disable more specialized lowerings when the shufps lowering
-/// will happen to be efficient.
-static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
- // This routine only handles 128-bit shufps.
- assert(Mask.size() == 4 && "Unsupported mask size!");
- assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
- assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
- assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
- assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
-
- // To lower with a single SHUFPS we need to have the low half and high half
- // each requiring a single input.
- if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
- return false;
- if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
- return false;
-
- return true;
-}
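// Illustrative masks for isSingleSHUFPSMask (still referenced below):
// <0, 1, 4, 5> and <3, 2, 7, 6> qualify because each half draws from a single
// input, while <0, 4, 1, 5> fails since its low half mixes both inputs.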
-
/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
-static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
@@ -12366,11 +12882,10 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
-static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -12379,8 +12894,8 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
@@ -12413,29 +12928,32 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// There are special ways we can lower some single-element blends. However, we
// have custom ways we can lower more complex single-element blends below that
// we defer to if both this and BLENDPS fail to match, so restrict this to
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (Subtarget.hasSSE41()) {
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use INSERTPS if we can complete the shuffle efficiently.
- if (SDValue V =
- lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
+ if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
if (!isSingleSHUFPSMask(Mask))
- if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
- DL, MVT::v4f32, V1, V2, Mask, DAG))
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
+ V2, Mask, DAG))
return BlendPerm;
}
@@ -12449,23 +12967,21 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
return V;
// Otherwise fall back to a SHUFPS lowering strategy.
- return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
+ return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
-static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -12473,16 +12989,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
@@ -12501,14 +13017,18 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
@@ -12516,29 +13036,28 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
}
@@ -12549,12 +13068,12 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
- if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
- DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
return Unpack;
}
@@ -12585,7 +13104,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
-static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
+static SDValue lowerV8I16GeneralSingleInputShuffle(
const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
@@ -12617,11 +13136,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
- int NumLToL =
- std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
+ int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
int NumHToL = LoInputs.size() - NumLToL;
- int NumLToH =
- std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
+ int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
int NumHToH = HiInputs.size() - NumLToH;
MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
@@ -12730,7 +13247,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// a half by taking the sum of the half with three inputs and subtracting
// the sum of the actual three inputs. The difference is the remaining
// slot.
- int ADWord, BDWord;
+ int ADWord = 0, BDWord = 0;
int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
@@ -12825,8 +13342,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
- DAG);
+ return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
@@ -13084,7 +13600,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
-static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
+static SDValue lowerShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
@@ -13147,54 +13663,51 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
-static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
- DAG, Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
+ Subtarget, DAG))
return Rotate;
// Make a copy of the mask so it can be modified.
SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
- MutableMask, Subtarget,
- DAG);
+ return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
+ Subtarget, DAG);
}
assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
@@ -13202,19 +13715,19 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
"shuffles.");
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
- if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG))
return V;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
@@ -13222,50 +13735,54 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
if (SDValue BitBlend =
- lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
+ // Try to use byte shift instructions to mask.
+ if (SDValue V = lowerVectorShuffleAsByteShiftMask(
+ DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return V;
+
// Try to lower by permuting the inputs into an unpack instruction.
- if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
- DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// can both shuffle and set up the inefficient blend.
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
- return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG, V1InUse, V2InUse);
+ return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG, V1InUse, V2InUse);
}
// We can always bit-blend if we have to so the fallback strategy is to
// decompose into single-input permutes and blends.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG);
}
/// Check whether a compaction lowering can be done by dropping even
@@ -13334,9 +13851,9 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
return 0;
}
-static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
@@ -13354,39 +13871,38 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
-static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use a zext lowering.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
- if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, DAG))
return V;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
@@ -13394,12 +13910,11 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
@@ -13492,13 +14007,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
}
- if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte shift instructions to mask.
+ if (SDValue V = lowerVectorShuffleAsByteShiftMask(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
@@ -13518,7 +14037,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
bool V1InUse = false;
bool V2InUse = false;
- SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
+ SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
@@ -13526,8 +14045,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
if (Subtarget.hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(
- DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// We can use an unpack to do the blending rather than an or in some
@@ -13538,17 +14057,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// FIXME: It might be worth trying to detect if the unpack-feeding
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
- if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
- return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
- if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute(
+ if (SDValue V = lowerShuffleAsByteRotateAndPermute(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return V;
}
@@ -13558,13 +14077,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
- if (SDValue BitBlend =
- lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
- return BitBlend;
+ if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Blend;
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
@@ -13605,8 +14123,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Handle multi-input cases by blending single-input shuffles.
if (NumV2Elements > 0)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
@@ -13661,24 +14179,24 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
-static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
- return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v2f64:
- return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i32:
- return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4f32:
- return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
- return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
- return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Unimplemented!");
@@ -13690,9 +14208,9 @@ static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
-static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
@@ -13816,11 +14334,10 @@ static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
-static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.");
int Size = Mask.size();
@@ -13845,8 +14362,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
return true;
};
if (DoBothBroadcast())
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+ Subtarget, DAG);
// If the inputs all stem from a single 128-bit lane of each input, then we
// split them rather than blending because the split will decompose to
@@ -13860,12 +14377,12 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
// Otherwise, just fall back to decomposed shuffles and a blend. This requires
// that the decomposed single-input shuffles don't end up here.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
+ DAG);
}
/// Lower a vector shuffle crossing multiple 128-bit lanes as
@@ -13874,9 +14391,9 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
/// This is mainly for cases where we can have non-repeating permutes
/// in each lane.
///
-/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes,
+/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
/// we should investigate merging them.
-static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
+static SDValue lowerShuffleAsLanePermuteAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
@@ -13884,7 +14401,6 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
int NumEltsPerLane = NumElts / NumLanes;
SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
- SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i) {
@@ -13899,10 +14415,20 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
return SDValue();
SrcLaneMask[DstLane] = SrcLane;
- LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane);
PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
}
+ // Make sure we set all elements of the lane mask, to avoid undef propagation.
+ SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
+ for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
+ int SrcLane = SrcLaneMask[DstLane];
+ if (0 <= SrcLane)
+ for (int j = 0; j != NumEltsPerLane; ++j) {
+ LaneMask[(DstLane * NumEltsPerLane) + j] =
+ (SrcLane * NumEltsPerLane) + j;
+ }
+ }
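// Worked example (hypothetical v8f32 case, NumEltsPerLane = 4): if SrcLaneMask
// is {1, 0}, the loop above fills LaneMask with <4, 5, 6, 7, 0, 1, 2, 3> even
// when PermMask only reads a few of those elements, so the intermediate lane
// shuffle is left with no undef elements.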
+
// If we're only shuffling a single lowest lane and the rest are identity
// then don't bother.
// TODO - isShuffleMaskInputInPlace could be extended to something like this.
@@ -13931,11 +14457,9 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
-static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleAsLanePermuteAndBlend(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int Size = Mask.size();
@@ -13950,14 +14474,14 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
} else {
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneUsed[(Mask[i] / LaneSize)] = true;
if (!LaneUsed[0] || !LaneUsed[1])
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
assert(V2.isUndef() &&
@@ -13981,11 +14505,11 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
}
/// Handle lowering 2-lane 128-bit shuffles.
-static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
@@ -14012,8 +14536,8 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return Blend;
// If either input operand is a zero vector, use VPERM2X128 because its mask
@@ -14084,9 +14608,7 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
-///
-/// FIXME: This should be generalized to 512-bit shuffles.
-static SDValue lowerVectorShuffleByMerging128BitLanes(
+static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
@@ -14095,12 +14617,10 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
return SDValue();
int Size = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
int LaneSize = 128 / VT.getScalarSizeInBits();
- int NumLanes = Size / LaneSize;
- assert(NumLanes == 2 && "Only handles 256-bit shuffles.");
-
SmallVector<int, 16> RepeatMask(LaneSize, -1);
- int LaneSrcs[2][2] = { { -1, -1 }, { -1 , -1 } };
+ SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
@@ -14111,7 +14631,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
int M = Mask[(Lane * LaneSize) + i];
if (M < 0)
continue;
- // Determine which of the 4 possible input lanes (2 from each source)
+ // Determine which of the possible input lanes (NumLanes from each source)
// this element comes from. Assign that as one of the sources for this
// lane. We can assign up to 2 sources for this lane. If we run out
// of sources we can't do anything.
@@ -14250,54 +14770,30 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
-/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
-/// This allows for fast cases such as subvector extraction/insertion
-/// or shuffling smaller vector types which can lower more efficiently.
-static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- assert((VT.is256BitVector() || VT.is512BitVector()) &&
- "Expected 256-bit or 512-bit vector");
-
- unsigned NumElts = VT.getVectorNumElements();
- unsigned HalfNumElts = NumElts / 2;
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
-
- bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
- bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
- if (!UndefLower && !UndefUpper)
- return SDValue();
-
- // Upper half is undef and lower half is whole upper subvector.
- // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- if (UndefUpper &&
- isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
- DAG.getIntPtrConstant(HalfNumElts, DL));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // Lower half is undef and upper half is whole lower subvector.
- // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- if (UndefLower &&
- isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
- DAG.getIntPtrConstant(HalfNumElts, DL));
- }
+/// If the input shuffle mask results in a vector that is undefined in all upper
+/// or lower half elements and that mask accesses only 2 halves of the
+/// shuffle's operands, return true. A mask of half the width with mask indexes
+/// adjusted to access the extracted halves of the original shuffle operands is
+/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
+/// lower half of each input operand is accessed.
+static bool
+getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
+ int &HalfIdx1, int &HalfIdx2) {
+ assert((Mask.size() == HalfMask.size() * 2) &&
+ "Expected input mask to be twice as long as output");
+
+ // Exactly one half of the result must be undef to allow narrowing.
+ bool UndefLower = isUndefLowerHalf(Mask);
+ bool UndefUpper = isUndefUpperHalf(Mask);
+ if (UndefLower == UndefUpper)
+ return false;
- // If the shuffle only uses two of the four halves of the input operands,
- // then extract them and perform the 'half' shuffle at half width.
- // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
- int HalfIdx1 = -1, HalfIdx2 = -1;
- SmallVector<int, 8> HalfMask(HalfNumElts);
- unsigned Offset = UndefLower ? HalfNumElts : 0;
+ unsigned HalfNumElts = HalfMask.size();
+ unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
+ HalfIdx1 = -1;
+ HalfIdx2 = -1;
for (unsigned i = 0; i != HalfNumElts; ++i) {
- int M = Mask[i + Offset];
+ int M = Mask[i + MaskIndexOffset];
if (M < 0) {
HalfMask[i] = M;
continue;
@@ -14324,42 +14820,27 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
}
// Too many half vectors referenced.
- return SDValue();
+ return false;
}
- assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
-
- // Only shuffle the halves of the inputs when useful.
- int NumLowerHalves =
- (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
- int NumUpperHalves =
- (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
-
- // uuuuXXXX - don't extract uppers just to insert again.
- if (UndefLower && NumUpperHalves != 0)
- return SDValue();
- // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
- if (UndefUpper && NumUpperHalves == 2)
- return SDValue();
+ return true;
+}
- // AVX2 - XXXXuuuu - always extract lowers.
- if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
- // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
- if (VT == MVT::v4f64 || VT == MVT::v4i64)
- return SDValue();
- // AVX2 supports variable 32-bit element cross-lane shuffles.
- if (VT == MVT::v8f32 || VT == MVT::v8i32) {
- // XXXXuuuu - don't extract lowers and uppers.
- if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
- return SDValue();
- }
- }
+/// Given the output values from getHalfShuffleMask(), create a half width
+/// shuffle of extracted vectors followed by an insert back to full width.
+static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
+ ArrayRef<int> HalfMask, int HalfIdx1,
+ int HalfIdx2, bool UndefLower,
+ SelectionDAG &DAG) {
+ assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
+ assert(V1.getValueType().isSimple() && "Expecting only simple types");
- // AVX512 - XXXXuuuu - always extract lowers.
- if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
- return SDValue();
+ MVT VT = V1.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
- auto GetHalfVector = [&](int HalfIdx) {
+ auto getHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
SDValue V = (HalfIdx < 2 ? V1 : V2);
@@ -14368,13 +14849,126 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
DAG.getIntPtrConstant(HalfIdx, DL));
};
- SDValue Half1 = GetHalfVector(HalfIdx1);
- SDValue Half2 = GetHalfVector(HalfIdx2);
+ // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
+ SDValue Half1 = getHalfVector(HalfIdx1);
+ SDValue Half2 = getHalfVector(HalfIdx2);
SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ unsigned Offset = UndefLower ? HalfNumElts : 0;
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
DAG.getIntPtrConstant(Offset, DL));
}
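To make the extract/shuffle/insert pattern concrete, here is a scalar model of the same transform on plain arrays (illustrative code, not the LLVM helper): it extracts V1's low half and V2's high half, applies the narrowed mask <0,1,4,5>, and writes the result into the low half of an otherwise-undef vector.

#include <array>
#include <cstdio>

int main() {
  std::array<float, 8> V1{0, 1, 2, 3, 4, 5, 6, 7};
  std::array<float, 8> V2{10, 11, 12, 13, 14, 15, 16, 17};

  // Inputs produced by the mask analysis: shuffle V1's low half (HalfIdx1 = 0)
  // with V2's high half (HalfIdx2 = 3) using HalfMask = <0,1,4,5>, writing the
  // result into the low half of the output (UndefLower = false).
  int HalfIdx1 = 0, HalfIdx2 = 3;
  std::array<int, 4> HalfMask{0, 1, 4, 5};

  // extract_subvector: pick the requested 128-bit half of each operand.
  auto extractHalf = [&](int HalfIdx) {
    const auto &Src = (HalfIdx < 2) ? V1 : V2;
    unsigned Base = (HalfIdx & 1) ? 4 : 0;
    return std::array<float, 4>{Src[Base], Src[Base + 1], Src[Base + 2],
                                Src[Base + 3]};
  };
  std::array<float, 4> Half1 = extractHalf(HalfIdx1);
  std::array<float, 4> Half2 = extractHalf(HalfIdx2);

  // Half-width vector_shuffle over concat(Half1, Half2).
  std::array<float, 4> Narrow{};
  for (int i = 0; i != 4; ++i)
    Narrow[i] = HalfMask[i] < 4 ? Half1[HalfMask[i]] : Half2[HalfMask[i] - 4];

  // insert_subvector into the low half; the undef high half is 0 here.
  std::array<float, 8> Result{};
  for (int i = 0; i != 4; ++i)
    Result[i] = Narrow[i];

  for (float F : Result)
    printf("%g ", F); // prints: 0 1 14 15 0 0 0 0
  printf("\n");
}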
+/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
+static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT.is256BitVector() || VT.is512BitVector()) &&
+ "Expected 256-bit or 512-bit vector");
+
+ bool UndefLower = isUndefLowerHalf(Mask);
+ if (!UndefLower && !isUndefUpperHalf(Mask))
+ return SDValue();
+
+ assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
+ "Completely undef shuffle mask should have been simplified already");
+
+ // Upper half is undef and lower half is whole upper subvector.
+ // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+ if (!UndefLower &&
+ isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Lower half is undef and upper half is whole lower subvector.
+ // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (UndefLower &&
+ isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ }
+
+ int HalfIdx1, HalfIdx2;
+ SmallVector<int, 8> HalfMask(HalfNumElts);
+ if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
+ return SDValue();
+
+ assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+ // Only shuffle the halves of the inputs when useful.
+ unsigned NumLowerHalves =
+ (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
+ unsigned NumUpperHalves =
+ (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
+ assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
+
+ // Determine the larger pattern of undef/halves, then decide if it's worth
+ // splitting the shuffle based on subtarget capabilities and types.
+ unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
+ if (!UndefLower) {
+ // XXXXuuuu: no insert is needed.
+ // When only the lower result half is written, always extract the lower
+ // input halves - these are all free subreg ops.
+ if (NumUpperHalves == 0)
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+
+ if (NumUpperHalves == 1) {
+ // AVX2 has efficient 32/64-bit element cross-lane shuffles.
+ if (Subtarget.hasAVX2()) {
+ // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
+ if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
+ !is128BitUnpackShuffleMask(HalfMask) &&
+ (!isSingleSHUFPSMask(HalfMask) ||
+ Subtarget.hasFastVariableShuffle()))
+ return SDValue();
+ // If this is a unary shuffle (assume that the 2nd operand is
+ // canonicalized to undef), then we can use vpermpd. Otherwise, we
+ // are better off extracting the upper half of 1 operand and using a
+ // narrow shuffle.
+ if (EltWidth == 64 && V2.isUndef())
+ return SDValue();
+ }
+ // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
+ if (Subtarget.hasAVX512() && VT.is512BitVector())
+ return SDValue();
+ // Extract + narrow shuffle is better than the wide alternative.
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+ }
+
+ // Don't extract both uppers, instead shuffle and then extract.
+ assert(NumUpperHalves == 2 && "Half vector count went wrong");
+ return SDValue();
+ }
+
+ // UndefLower - uuuuXXXX: an insert into the high half is required if we split this.
+ if (NumUpperHalves == 0) {
+ // AVX2 has efficient 64-bit element cross-lane shuffles.
+ // TODO: Refine to account for unary shuffle, splat, and other masks?
+ if (Subtarget.hasAVX2() && EltWidth == 64)
+ return SDValue();
+ // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
+ if (Subtarget.hasAVX512() && VT.is512BitVector())
+ return SDValue();
+ // Narrow shuffle + insert is better than the wide alternative.
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+ }
+
+ // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
+ return SDValue();
+}
+
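The two "whole subvector" fast paths at the top of lowerShuffleWithUndefHalf reduce to a sequential-or-undef range check; only when neither applies does the code fall through to getHalfShuffleMask and the NumLowerHalves/NumUpperHalves cost heuristics. A minimal standalone sketch of that check, with isSequentialOrUndef as a local stand-in for the LLVM helper of the same spirit:

#include <cstdio>
#include <vector>

static bool isSequentialOrUndef(const std::vector<int> &Mask, unsigned Pos,
                                unsigned Size, int Low) {
  for (unsigned i = Pos; i != Pos + Size; ++i, ++Low)
    if (Mask[i] >= 0 && Mask[i] != Low)
      return false;
  return true;
}

int main() {
  unsigned HalfNumElts = 4;
  // <4,5,6,7,u,u,u,u>: upper result half undef, lower half is exactly V1's
  // upper subvector -> extract_subvector(V1, 4) + insert_subvector at 0.
  std::vector<int> Mask = {4, 5, 6, 7, -1, -1, -1, -1};
  printf("%d\n", isSequentialOrUndef(Mask, 0, HalfNumElts, HalfNumElts)); // 1
  // <u,u,u,u,0,1,2,3>: the mirror case -> extract_subvector(V1, 0) +
  // insert_subvector at HalfNumElts.
  std::vector<int> Mask2 = {-1, -1, -1, -1, 0, 1, 2, 3};
  printf("%d\n", isSequentialOrUndef(Mask2, HalfNumElts, HalfNumElts, 0)); // 1
}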
/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
@@ -14560,9 +15154,8 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
SubLaneMask);
}
-static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
- unsigned &ShuffleImm,
- ArrayRef<int> Mask) {
+static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &ShuffleImm, ArrayRef<int> Mask) {
int NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
@@ -14597,14 +15190,14 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
return false;
}
-static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
"Unexpected data type for VSHUFPD");
unsigned Immediate = 0;
- if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
+ if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
return SDValue();
return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
@@ -14615,23 +15208,22 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
-static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return V;
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
@@ -14659,29 +15251,33 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
- DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget))
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
+ Mask, DAG, Subtarget))
return V;
// Otherwise, fall back.
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG,
+ Subtarget);
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (SDValue Op =
- lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
return Op;
+ // If we have one input in place, then we can permute the other input and
+ // blend the result.
+ if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -14694,52 +15290,51 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// instruction so skip this pattern.
if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
- return Result;
+ return V;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
}
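Several of the paths above fall back to lowerShuffleAsDecomposedShuffleBlend, i.e. "permute each input, then blend". The sketch below models that decomposition for a v4f64-style mask; the logic and names are illustrative, not the LLVM routine, and it shows why an input that is already in place needs no permute of its own.

#include <cstdio>
#include <vector>

int main() {
  // Mask <0,5,2,7> over concat(V1,V2): even lanes come from V1 (already in
  // place), odd lanes come from V2.
  std::vector<int> Mask = {0, 5, 2, 7};
  int NumElts = (int)Mask.size();

  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1);
  std::vector<int> BlendFromV2(NumElts, 0);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (M < NumElts) {
      V1Mask[i] = M;           // permute V1 so its element lands in lane i
    } else {
      V2Mask[i] = M - NumElts; // permute V2 likewise
      BlendFromV2[i] = 1;      // then blend lane i from the permuted V2
    }
  }

  auto dump = [](const char *Name, const std::vector<int> &V) {
    printf("%s:", Name);
    for (int X : V)
      printf(" %d", X);
    printf("\n");
  };
  dump("V1Mask", V1Mask);           // 0 -1 2 -1 (identity: V1 is in place)
  dump("V2Mask", V2Mask);           // -1 1 -1 3
  dump("BlendFromV2", BlendFromV2); // 0 1 0 1
}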
/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
-static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
- if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
if (V2.isUndef()) {
@@ -14763,31 +15358,36 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
}
// Try to use PALIGNR.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
+ // If we have one input in place, then we can permute the other input and
+ // blend the result.
+ if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG);
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -14800,35 +15400,34 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// instruction so skip this pattern.
if (!isShuffleMaskInputInPlace(0, Mask) &&
!isShuffleMaskInputInPlace(1, Mask))
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
-static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane, we have many more
@@ -14849,13 +15448,12 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends.
- return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
+ return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
}
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -14875,49 +15473,49 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
// Otherwise, fall back.
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
+ DAG, Subtarget);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
+
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
// For non-AVX512, if the mask uses 16-bit elements within each lane, try to
// split, since after the split we get more efficient code using vpunpcklwd
// and vpunpckhwd instrs than vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
- if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG))
return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG);
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG);
}
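The VPERMILPS/SHUFPS paths in this function are gated on the mask repeating in every 128-bit lane. A standalone model of that check (isLaneRepeated is an illustrative stand-in for is128BitLaneRepeatedShuffleMask):

#include <cstdio>
#include <vector>

// A mask is "repeated in each 128-bit lane" if no element crosses a lane and
// every lane applies the same LaneElts-wide pattern; RepeatedMask receives
// that pattern with V2 indices rebased to start at LaneElts.
static bool isLaneRepeated(const std::vector<int> &Mask, unsigned LaneElts,
                           std::vector<int> &RepeatedMask) {
  unsigned NumElts = Mask.size();
  RepeatedMask.assign(LaneElts, -1);
  for (unsigned i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // The element must come from the lane matching its destination lane.
    unsigned SrcLane = ((unsigned)M % NumElts) / LaneElts;
    if (SrcLane != i / LaneElts)
      return false;
    // All lanes must agree on the within-lane pattern.
    int Local = M % (int)LaneElts + (M >= (int)NumElts ? (int)LaneElts : 0);
    int &R = RepeatedMask[i % LaneElts];
    if (R < 0)
      R = Local;
    else if (R != Local)
      return false;
  }
  return true;
}

int main() {
  std::vector<int> Repeated;
  // <1,0,3,2,5,4,7,6> swaps pairs inside each lane: repeated mask <1,0,3,2>.
  std::vector<int> Mask = {1, 0, 3, 2, 5, 4, 7, 6};
  printf("%d\n", isLaneRepeated(Mask, 4, Repeated)); // 1
  // <0,1,2,3,0,1,2,3> pulls lane 0 into lane 1, so it is not lane-repeating.
  std::vector<int> Cross = {0, 1, 2, 3, 0, 1, 2, 3};
  printf("%d\n", isLaneRepeated(Cross, 4, Repeated)); // 0
}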
/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
-static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@@ -14926,8 +15524,8 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
// For non-AVX512, if the mask uses 16-bit elements within each lane, try to split
@@ -14935,17 +15533,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// vpunpcklwd and vpunpckhwd instrs.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
!Subtarget.hasAVX512())
- if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane we can use more
@@ -14961,30 +15559,29 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
}
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -15006,31 +15603,30 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
- SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
- CastV1, CastV2, DAG);
+ SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
+ CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v8i32, ShufPS);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG);
}
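The calls to getV4X86ShuffleImm8ForMask above turn a 4-element repeated mask into the classic two-bits-per-element immediate used by PSHUFD/SHUFPS/VPERMILPS. A minimal sketch of that encoding (the helper name here is illustrative):

#include <cstdio>

static unsigned shuffleImm8(const int (&Mask)[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // undef lanes can pick anything
    Imm |= (unsigned)M << (2 * i);     // two bits per destination element
  }
  return Imm;
}

int main() {
  int Mask[4] = {1, 0, 3, 2}; // swap adjacent pairs
  printf("0x%02x\n", shuffleImm8(Mask)); // 0xb1, e.g. vpshufd $0xb1
}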
/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
-static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -15039,37 +15635,36 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -15082,12 +15677,12 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
- Mask, DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask,
+ DAG, Subtarget);
}
SmallVector<int, 8> RepeatedMask;
@@ -15095,44 +15690,43 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v16 case.
- return lowerV8I16GeneralSingleInputVectorShuffle(
+ return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
}
}
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512BWVL can lower to VPERMW.
if (Subtarget.hasBWI() && Subtarget.hasVLX())
- return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG);
}
/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
-static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
@@ -15141,37 +15735,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -15183,36 +15776,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG,
+ Subtarget);
}
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512VBMIVL can lower to VPERMB.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
- return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG);
}
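The i8/i16 lowerings above first ask whether the mask crosses a 128-bit lane at all, since these element types have no generalized cross-lane shuffles. A standalone model of that test (crossesLane is an illustrative stand-in for is128BitLaneCrossingShuffleMask):

#include <cstdio>
#include <vector>

// An element "crosses" a 128-bit lane when its source lane differs from its
// destination lane; masks that do this need the lane-permute fallbacks.
static bool crossesLane(const std::vector<int> &Mask, unsigned LaneElts) {
  unsigned NumElts = Mask.size();
  for (unsigned i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (((unsigned)M % NumElts) / LaneElts != i / LaneElts)
      return true;
  }
  return false;
}

int main() {
  // v32i8: 16 bytes per 128-bit lane. Element 0 pulling byte 20 crosses lanes.
  std::vector<int> Mask(32, -1);
  Mask[0] = 20;
  printf("%d\n", crossesLane(Mask, 16)); // 1
  Mask[0] = 5;
  printf("%d\n", crossesLane(Mask, 16)); // 0
}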
/// High-level routine to lower various 256-bit x86 vector shuffles.
@@ -15220,24 +15813,23 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = VT.getVectorNumElements();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
- lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// There is a really nice hard cut-over between AVX1 and AVX2 that means we
@@ -15251,12 +15843,12 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (ElementBits < 32) {
// No floating-point type is available; if we can't use the bit operations
// for masking/blending, then decompose into 128-bit vectors.
- if (SDValue V =
- lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+ if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return V;
- if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -15268,17 +15860,17 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
switch (VT.SimpleTy) {
case MVT::v4f64:
- return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i64:
- return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8f32:
- return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i32:
- return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i16:
- return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i8:
- return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
@@ -15286,12 +15878,10 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
/// Try to lower a vector shuffle as 128-bit shuffles.
-static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
@@ -15388,11 +15978,10 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
}
/// Handle lowering of 8-lane 64-bit floating point shuffles.
-static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@@ -15419,37 +16008,33 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
}
- if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
- Subtarget, DAG))
+ if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
+ V2, Subtarget, DAG))
return Shuf128;
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (SDValue Op =
- lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Op;
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
- V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
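Both the v4f64 and v8f64 paths probe whether the mask "happens to exactly fit that of SHUFPD". The simplified sketch below shows what that means for a v4f64 mask and how the immediate is assembled; it models the non-commuted case only (the real matcher also tries swapping the operands and handles zeroable lanes), and the helper name is illustrative.

#include <cstdio>

// Each even destination lane must pick from V1's matching 128-bit lane, each
// odd lane from V2's, and the per-lane low/high choice becomes one bit of the
// VSHUFPD immediate.
static bool matchSHUFPDImm(const int (&Mask)[4], unsigned &Imm) {
  Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef lane: any immediate bit works
    int Base = (i & 1) ? 4 : 0; // V1 for even i, V2 (offset 4) for odd i
    int Lane = (i / 2) * 2;     // stay within the destination's 128-bit lane
    if (M != Base + Lane && M != Base + Lane + 1)
      return false;
    Imm |= (unsigned)(M & 1) << i;
  }
  return true;
}

int main() {
  int Mask[4] = {1, 5, 2, 7}; // = {V1[1], V2[1], V1[2], V2[3]}
  unsigned Imm;
  if (matchSHUFPDImm(Mask, Imm))
    printf("vshufpd imm = 0x%x\n", Imm); // 0xb
}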
/// Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -15471,16 +16056,15 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
- return Unpck;
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
+ return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Otherwise, fall back to a SHUFPS sequence.
- return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
+ return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
// If we have a single input shuffle with different shuffle patterns in the
@@ -15492,19 +16076,18 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// If we have AVX512F support, we can use VEXPAND.
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
- return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
-static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@@ -15530,47 +16113,44 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
}
- if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
- V1, V2, Subtarget, DAG))
+ if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
+ V2, Subtarget, DAG))
return Shuf128;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to use PALIGNR.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
// If we have AVX512F support, we can use VEXPAND.
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
- V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
-static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -15578,7 +16158,7 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
@@ -15595,25 +16175,24 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to use byte rotation instructions.
if (Subtarget.hasBWI())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Assume that a single SHUFPS is faster than using a permv shuffle.
@@ -15621,27 +16200,26 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
- SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
- CastV1, CastV2, DAG);
+ SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
+ CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
// If we have AVX512F support, we can use VEXPAND.
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
-static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
@@ -15650,23 +16228,22 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
if (V2.isUndef()) {
@@ -15675,28 +16252,27 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.
- return lowerV8I16GeneralSingleInputVectorShuffle(
+ return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
}
}
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
- return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
-static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
@@ -15705,37 +16281,36 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())
- return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
@@ -15743,12 +16318,19 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (!V2.isUndef())
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -15756,11 +16338,11 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
@@ -15770,18 +16352,18 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
- lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast =
- lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
// Dispatch to each element type for lowering. If we don't have support for
@@ -15790,17 +16372,17 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
- return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
- return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
- return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
- return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i16:
- return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v64i8:
- return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 512-bit x86 vector type!");
@@ -15809,7 +16391,7 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Determine if this shuffle can be implemented with a KSHIFT instruction.
// Returns the shift amount if possible or -1 if not. This is a simplified
-// version of matchVectorShuffleAsShift.
+// version of matchShuffleAsShift.
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable) {
int Size = Mask.size();
@@ -15844,11 +16426,11 @@ static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
-static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
@@ -16037,15 +16619,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// Check for non-undef masks pointing at an undef vector and make the masks
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
- if (V2IsUndef)
- for (int M : Mask)
- if (M >= NumElements) {
- SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
- for (int &M : NewMask)
- if (M >= NumElements)
- M = -1;
- return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
- }
+ if (V2IsUndef &&
+ any_of(Mask, [NumElements](int M) { return M >= NumElements; })) {
+ SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+ for (int &M : NewMask)
+ if (M >= NumElements)
+ M = -1;
+ return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
+ }
// Check for illegal shuffle mask element index values.
int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
@@ -16083,8 +16664,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
- if (SDValue Broadcast =
- lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
MVT NewEltVT = VT.isFloatingPoint()
@@ -16122,26 +16703,21 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
if (canonicalizeShuffleMaskWithCommute(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
- if (SDValue V =
- lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
return V;
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
- return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is256BitVector())
- return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is512BitVector())
- return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (Is1BitVector)
- return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
llvm_unreachable("Unimplemented!");
}
@@ -16401,7 +16977,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// this can be done with a mask.
IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
- DAG.getConstant(IdxVal, dl, MVT::i32));
+ DAG.getIntPtrConstant(IdxVal, dl));
}
assert(VecVT.is128BitVector() && "Unexpected vector length");
@@ -16527,10 +17103,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
- if (!isa<ConstantSDNode>(N2))
+
+ auto *N2C = dyn_cast<ConstantSDNode>(N2);
+ if (!N2C || N2C->getAPIntValue().uge(NumElts))
return SDValue();
- auto *N2C = cast<ConstantSDNode>(N2);
- unsigned IdxVal = N2C->getZExtValue();
+ uint64_t IdxVal = N2C->getZExtValue();
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
@@ -16575,13 +17152,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
- DAG.getConstant(IdxIn128, dl, MVT::i32));
+ DAG.getIntPtrConstant(IdxIn128, dl));
// Insert the changed part back into the bigger vector
return insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
+ // This will be just movd/movq/movss/movsd.
+ if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
+ (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ EltVT == MVT::i64)) {
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ }
+
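// Editorial sketch (illustrative only, not part of this patch): at the
// intrinsics level, inserting a scalar into lane 0 of an all-zeros vector is
// exactly what a movd/movss-style move provides, since it implicitly clears
// the upper lanes; no extra blend or shuffle is needed.
#include <immintrin.h>
static __m128i insertIntoZeroVectorSketch(int X) {
  // insertelement <4 x i32> zeroinitializer, i32 X, i64 0  ==>  movd
  return _mm_cvtsi32_si128(X); // lane 0 = X, lanes 1..3 = 0
}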
// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
// argument. SSE41 required for pinsrb.
if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
@@ -16613,7 +17198,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
- bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
+ bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
@@ -16663,7 +17248,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
- assert(OpVT.is128BitVector() && "Expected an SSE type!");
+ assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
+ "Expected an SSE type!");
// Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
if (OpVT == MVT::v4i32)
@@ -16789,35 +17375,9 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
-SDValue
-X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
- const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
-
- // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
- // global base reg.
- const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
- unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
-
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
-
- SDLoc DL(Op);
- Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
-
- // With PIC, the address is actually $g + Offset.
- if (OpFlag) {
- Result =
- DAG.getNode(ISD::ADD, DL, PtrVT,
- DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
- }
-
- // For symbols that require a load from a stub to get the address, emit the
- // load.
- if (isGlobalStubReference(OpFlag))
- Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()));
-
- return Result;
+SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
+ SelectionDAG &DAG) const {
+ return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
SDValue
@@ -16841,35 +17401,67 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
-SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
- const SDLoc &dl, int64_t Offset,
- SelectionDAG &DAG) const {
- // Create the TargetGlobalAddress node, folding in the constant
- // offset if it is legal.
- unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
+/// Creates target global address or external symbol nodes for calls or
+/// other uses.
+SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
+ bool ForCall) const {
+ // Unpack the global address or external symbol.
+ const SDLoc &dl = SDLoc(Op);
+ const GlobalValue *GV = nullptr;
+ int64_t Offset = 0;
+ const char *ExternalSym = nullptr;
+ if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
+ GV = G->getGlobal();
+ Offset = G->getOffset();
+ } else {
+ const auto *ES = cast<ExternalSymbolSDNode>(Op);
+ ExternalSym = ES->getSymbol();
+ }
+
+ // Calculate some flags for address lowering.
+ const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
+ unsigned char OpFlags;
+ if (ForCall)
+ OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
+ else
+ OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
+ bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
+ bool NeedsLoad = isGlobalStubReference(OpFlags);
+
CodeModel::Model M = DAG.getTarget().getCodeModel();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
- if (OpFlags == X86II::MO_NO_FLAG &&
- X86::isOffsetSuitableForCodeModel(Offset, M)) {
- // A direct static reference to a global.
- Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
- Offset = 0;
+
+ if (GV) {
+ // Create a target global address if this is a global. If possible, fold the
+ // offset into the global address reference. Otherwise, ADD it on later.
+ int64_t GlobalOffset = 0;
+ if (OpFlags == X86II::MO_NO_FLAG &&
+ X86::isOffsetSuitableForCodeModel(Offset, M)) {
+ std::swap(GlobalOffset, Offset);
+ }
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
} else {
- Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
+ // If this is not a global address, this must be an external symbol.
+ Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
}
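// Editorial sketch (illustrative only, not part of this patch) of the
// GlobalOffset/Offset swap above: when the displacement can be folded into
// the address node, the trailing ADD emitted below must see an offset of 0;
// otherwise the node is built with offset 0 and the displacement is added
// afterwards. The names here are hypothetical.
static void splitGlobalOffsetSketch(long long Offset, bool Foldable,
                                    long long &FoldedIntoNode,
                                    long long &AddedLater) {
  FoldedIntoNode = 0;
  AddedLater = Offset;
  if (Foldable) {          // mirrors the std::swap(GlobalOffset, Offset) above
    FoldedIntoNode = Offset;
    AddedLater = 0;
  }
}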
+ // If this is a direct call, avoid the wrapper if we don't need to do any
+ // loads or adds. This allows SDAG ISel to match direct calls.
+ if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
+ return Result;
+
Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
- if (isGlobalRelativeToPICBase(OpFlags)) {
+ if (HasPICReg) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
// For globals that require a load from a stub to get the address, emit the
// load.
- if (isGlobalStubReference(OpFlags))
+ if (NeedsLoad)
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
@@ -16884,9 +17476,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
- return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
+ return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
static SDValue
@@ -17112,9 +17702,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
- if (Subtarget.isTargetKnownWindowsMSVC() ||
- Subtarget.isTargetWindowsItanium() ||
- Subtarget.isTargetWindowsGNU()) {
+ if (Subtarget.isOSWindows()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -17254,7 +17842,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
APInt APIntShiftAmt;
if (isConstantSplat(Amt, APIntShiftAmt)) {
- uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
+ uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8));
}
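// Editorial sketch (illustrative only, not part of this patch): ISD::FSHL and
// ISD::FSHR interpret the shift amount modulo the element width, so the urem
// above keeps the immediate in range (e.g. a splat amount of 36 on 32-bit
// elements behaves as 4) before forming VSHLD/VSHRD.
static unsigned fshl32Sketch(unsigned Hi, unsigned Lo, unsigned Amt) {
  Amt %= 32;                         // the urem performed above
  if (Amt == 0)
    return Hi;                       // avoid an undefined shift by 32
  return (Hi << Amt) | (Lo >> (32 - Amt));
}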
@@ -17267,7 +17855,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
"Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
@@ -17311,6 +17899,70 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, dl));
}
+static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
+ const X86Subtarget &Subtarget) {
+ switch (Opcode) {
+ case ISD::SINT_TO_FP:
+ // TODO: Handle wider types with AVX/AVX512.
+ if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
+ return false;
+ // CVTDQ2PS or (V)CVTDQ2PD
+ return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
+
+ case ISD::UINT_TO_FP:
+ // TODO: Handle wider types and i64 elements.
+ if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
+ return false;
+ // VCVTUDQ2PS or VCVTUDQ2PD
+ return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+
+ default:
+ return false;
+ }
+}
+
+/// Given a scalar cast operation that is extracted from a vector, try to
+/// vectorize the cast op followed by extraction. This will avoid an expensive
+/// round-trip between XMM and GPR.
+static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: This could be enhanced to handle smaller integer types by peeking
+ // through an extend.
+ SDValue Extract = Cast.getOperand(0);
+ MVT DestVT = Cast.getSimpleValueType();
+ if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Extract.getOperand(1)))
+ return SDValue();
+
+ // See if we have a 128-bit vector cast op for this type of cast.
+ SDValue VecOp = Extract.getOperand(0);
+ MVT FromVT = VecOp.getSimpleValueType();
+ unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
+ MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
+ MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
+ if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
+ return SDValue();
+
+ // If we are extracting from a non-zero element, first shuffle the source
+ // vector to allow extracting from element zero.
+ SDLoc DL(Cast);
+ if (!isNullConstant(Extract.getOperand(1))) {
+ SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
+ Mask[0] = Extract.getConstantOperandVal(1);
+ VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
+ }
+ // If the source vector is wider than 128-bits, extract the low part. Do not
+ // create an unnecessarily wide vector cast op.
+ if (FromVT != Vec128VT)
+ VecOp = extract128BitVector(VecOp, 0, DAG, DL);
+
+ // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
+ // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
+ SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
+ DAG.getIntPtrConstant(0, DL));
+}
+
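// Editorial sketch (illustrative only, not part of this patch) of what
// vectorizeExtractedCast buys for the SSE2 v4i32 case: instead of moving one
// lane to a GPR and converting with cvtsi2ss, keep the value in the XMM
// domain, shuffle the wanted lane to position 0, convert the whole vector,
// and read back lane 0.
#include <immintrin.h>
static float sintToFpFromLane1Sketch(__m128i V) {
  // (float)V[1] without an XMM -> GPR -> XMM round-trip:
  __m128i Shuf = _mm_shuffle_epi32(V, _MM_SHUFFLE(1, 1, 1, 1)); // lane 1 -> lane 0
  __m128 Cvt = _mm_cvtepi32_ps(Shuf);                           // cvtdq2ps
  return _mm_cvtss_f32(Cvt);                                    // extract lane 0
}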
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
@@ -17318,6 +17970,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
@@ -17371,23 +18026,23 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
else
Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
- unsigned ByteSize = SrcVT.getSizeInBits()/8;
+ unsigned ByteSize = SrcVT.getSizeInBits() / 8;
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
- MachineMemOperand *MMO;
+ MachineMemOperand *LoadMMO;
if (FI) {
int SSFI = FI->getIndex();
- MMO = DAG.getMachineFunction().getMachineMemOperand(
+ LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOLoad, ByteSize, ByteSize);
} else {
- MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+ LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
StackSlot = StackSlot.getOperand(1);
}
- SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
- SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
- X86ISD::FILD, DL,
- Tys, Ops, SrcVT, MMO);
+ SDValue FILDOps[] = {Chain, StackSlot};
+ SDValue Result =
+ DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
+ Tys, FILDOps, SrcVT, LoadMMO);
if (useSSE) {
Chain = Result.getValue(1);
@@ -17397,20 +18052,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
// shouldn't be necessary except that RFP cannot be live across
// multiple blocks. When stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned SSFISize = Op.getValueSizeInBits()/8;
+ unsigned SSFISize = Op.getValueSizeInBits() / 8;
int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
- SDValue Ops[] = {
- Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
- };
- MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOStore, SSFISize, SSFISize);
- Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
- Ops, Op.getValueType(), MMO);
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
+ Op.getValueType(), StoreMMO);
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
@@ -17545,7 +18198,7 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
// Two to the power of half-word-size.
- SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
+ SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64);
// Clear upper part of LO, lower HI.
SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
@@ -17680,6 +18333,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (Op.getSimpleValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
@@ -17732,7 +18388,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
MachineMemOperand::MOLoad, 8, 8);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
- SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+ SDValue Ops[] = { Store, StackSlot };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
MVT::i64, MMO);
@@ -17768,16 +18424,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
-// just return an <SDValue(), SDValue()> pair.
+// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
-// to i16, i32 or i64, and we lower it to a legal sequence.
-// If lowered to the final integer result we return a <result, SDValue()> pair.
-// Otherwise we lower it to a sequence ending with a FIST, return a
-// <FIST, StackSlot> pair, and the caller is responsible for loading
-// the final integer result from StackSlot.
-std::pair<SDValue,SDValue>
+// to i16, i32 or i64, and we lower it to a legal sequence and return the
+// result.
+SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool IsSigned, bool IsReplace) const {
+ bool IsSigned) const {
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
@@ -17787,18 +18440,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
// f16 must be promoted before using the lowering in this routine.
// fp128 does not use this lowering.
- return std::make_pair(SDValue(), SDValue());
+ return SDValue();
}
// If using FIST to compute an unsigned i64, we'll need some fixup
// to handle values above the maximum signed i64. A FIST is always
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
- bool UnsignedFixup = !IsSigned &&
- DstTy == MVT::i64 &&
- (!Subtarget.is64Bit() ||
- !isScalarFPTypeInSSEReg(TheVT));
+ bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
- if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
+ if (!IsSigned && DstTy != MVT::i64) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
@@ -17809,30 +18459,13 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
DstTy.getSimpleVT() >= MVT::i16 &&
"Unknown FP_TO_INT to lower!");
- // These are really Legal.
- if (DstTy == MVT::i32 &&
- isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
- return std::make_pair(SDValue(), SDValue());
- if (Subtarget.is64Bit() &&
- DstTy == MVT::i64 &&
- isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
- return std::make_pair(SDValue(), SDValue());
-
// We lower FP->int64 into FISTP64 followed by a load from a temporary
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned MemSize = DstTy.getSizeInBits()/8;
+ unsigned MemSize = DstTy.getStoreSize();
int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- unsigned Opc;
- switch (DstTy.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
- case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
- case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
- case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
- }
-
SDValue Chain = DAG.getEntryNode();
SDValue Value = Op.getOperand(0);
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
@@ -17874,9 +18507,10 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
Value, ThreshVal, ISD::SETLT);
- Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(0x80000000, DL, MVT::i32));
+ Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
+ DAG.getConstant(0, DL, MVT::i64),
+ DAG.getConstant(APInt::getSignMask(64),
+ DL, MVT::i64));
SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
@@ -17884,81 +18518,52 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
}
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
+
// FIXME This causes a redundant load/store if the SSE-class value is already
// in memory, such as if it is on the callstack.
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
- Chain = DAG.getStore(Chain, DL, Value, StackSlot,
- MachinePointerInfo::getFixedStack(MF, SSFI));
- SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
- SDValue Ops[] = {
- Chain, StackSlot, DAG.getValueType(TheVT)
- };
-
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
- MachineMemOperand::MOLoad, MemSize, MemSize);
- Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
+ Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
+ SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
+ SDValue Ops[] = { Chain, StackSlot };
+
+ unsigned FLDSize = TheVT.getStoreSize();
+ assert(FLDSize <= MemSize && "Stack slot not big enough");
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
+ Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
Chain = Value.getValue(1);
- SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
- StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
}
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
- MachineMemOperand::MOStore, MemSize, MemSize);
-
- if (UnsignedFixup) {
-
- // Insert the FIST, load its result as two i32's,
- // and XOR the high i32 with Adjust.
+ // Build the FP_TO_INT*_IN_MEM
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, MemSize, MemSize);
+ SDValue Ops[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
+ DAG.getVTList(MVT::Other),
+ Ops, DstTy, MMO);
- SDValue FistOps[] = { Chain, Value, StackSlot };
- SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- FistOps, DstTy, MMO);
+ SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
- SDValue Low32 =
- DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
- SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
+ // If we need an unsigned fixup, XOR the result with adjust.
+ if (UnsignedFixup)
+ Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
- SDValue High32 =
- DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
- High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
-
- if (Subtarget.is64Bit()) {
- // Join High32 and Low32 into a 64-bit result.
- // (High32 << 32) | Low32
- Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
- High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
- High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
- DAG.getConstant(32, DL, MVT::i8));
- SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
- return std::make_pair(Result, SDValue());
- }
-
- SDValue ResultOps[] = { Low32, High32 };
-
- SDValue pair = IsReplace
- ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
- : DAG.getMergeValues(ResultOps, DL);
- return std::make_pair(pair, SDValue());
- } else {
- // Build the FP_TO_INT*_IN_MEM
- SDValue Ops[] = { Chain, Value, StackSlot };
- SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- Ops, DstTy, MMO);
- return std::make_pair(FIST, StackSlot);
- }
+ return Res;
}
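// Editorial sketch (illustrative only, not part of this patch; assumes a
// truncating conversion) of the unsigned-i64 fixup built above: inputs at or
// above 2^63 are rebased into signed range before the FIST-style signed
// conversion, and the sign bit is restored by the trailing XOR with Adjust.
static unsigned long long fpToUint64Sketch(double X) {
  const double Thresh = 9223372036854775808.0; // 2^63, exactly representable
  unsigned long long Adjust = 0;
  if (X >= Thresh) {                 // the SETLT/select pair above, inverted
    X -= Thresh;
    Adjust = 1ULL << 63;
  }
  return (unsigned long long)(long long)X ^ Adjust;
}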
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- MVT VT = Op->getSimpleValueType(0);
- SDValue In = Op->getOperand(0);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
+ unsigned Opc = Op.getOpcode();
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
+ assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
+ "Unexpected extension opcode");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
@@ -17970,6 +18575,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
+ unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
+
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
@@ -17977,8 +18584,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input.
- return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In);
+ return DAG.getNode(ExtendInVecOpc, dl, VT, In);
}
if (Subtarget.hasInt256())
@@ -18000,11 +18606,17 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
- SDValue OpLo = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, HalfVT, In);
+ SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
+
+ // Short-circuit if we can determine that each 128-bit half is the same value.
+ // Otherwise, this is difficult to match and optimize.
+ if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
+ if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
SDValue Undef = DAG.getUNDEF(InVT);
- bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+ bool NeedZero = Opc == ISD::ZERO_EXTEND;
SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
OpHi = DAG.getBitcast(HalfVT, OpHi);
@@ -18179,8 +18791,11 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
- Res = DAG.getBitcast(MVT::v4i64, Res);
- Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
+ // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
+ SmallVector<int, 64> Mask;
+ int Scale = 64 / OutVT.getScalarSizeInBits();
+ scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
+ Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
return DAG.getBitcast(DstVT, Res);
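// Editorial sketch (illustrative only, not part of this patch) of the mask
// scaling used above: widening the v4i64 lane-fix mask {0,2,1,3} by
// Scale = 64 / OutVT.getScalarSizeInBits() keeps the shuffle in OutVT. For
// OutVT = v16i16, Scale = 4 and the mask becomes
// {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15}.
#include <vector>
static std::vector<int> scaleMaskSketch(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Scaled;
  for (int M : Mask)
    for (int i = 0; i != Scale; ++i)
      Scaled.push_back(M < 0 ? M : M * Scale + i); // keep sentinels as-is
  return Scaled;
}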
@@ -18422,12 +19037,12 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
MVT VT = Op.getSimpleValueType();
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ SDLoc dl(Op);
if (VT.isVector()) {
- SDValue Src = Op.getOperand(0);
- SDLoc dl(Op);
-
- if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
+ if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
MVT TruncVT = MVT::v4i1;
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
@@ -18447,7 +19062,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
- if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
+ if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32)));
@@ -18458,19 +19073,34 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
assert(!VT.isVector());
- std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
- IsSigned, /*IsReplace=*/ false);
- SDValue FIST = Vals.first, StackSlot = Vals.second;
- // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
- if (!FIST.getNode())
+ bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
+
+ if (!IsSigned && Subtarget.hasAVX512()) {
+ // Conversions from f32/f64 should be legal.
+ if (UseSSEReg)
+ return Op;
+
+ // Use default expansion.
+ if (VT == MVT::i64)
+ return SDValue();
+ }
+
+ // Promote i16 to i32 if we can use a SSE operation.
+ if (VT == MVT::i16 && UseSSEReg) {
+ assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
+ SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+  // If this is a FP_TO_SINT using SSEReg we're done.
+ if (UseSSEReg && IsSigned)
return Op;
- if (StackSlot.getNode())
- // Load the result.
- return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
+ // Fall back to X87.
+ if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned))
+ return V;
- // The node is the result.
- return FIST;
+ llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
@@ -18491,7 +19121,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}
@@ -18513,16 +19143,11 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
if (!IsFP && !Subtarget.hasSSSE3())
return Op;
- // Defer forming the minimal horizontal op if the vector source has more than
- // the 2 extract element uses that we're matching here. In that case, we might
- // form a horizontal op that includes more than 1 add/sub op.
+ // Extract from a common vector.
if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
LHS.getOperand(0) != RHS.getOperand(0) ||
- !LHS.getOperand(0)->hasNUsesOfValue(2, 0))
- return Op;
-
- if (!isa<ConstantSDNode>(LHS.getOperand(1)) ||
+ !isa<ConstantSDNode>(LHS.getOperand(1)) ||
!isa<ConstantSDNode>(RHS.getOperand(1)) ||
!shouldUseHorizontalOp(true, DAG, Subtarget))
return Op;
@@ -18540,33 +19165,37 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
}
unsigned LExtIndex = LHS.getConstantOperandVal(1);
unsigned RExtIndex = RHS.getConstantOperandVal(1);
- if (LExtIndex == 1 && RExtIndex == 0 &&
+ if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
(HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
std::swap(LExtIndex, RExtIndex);
- // TODO: This can be extended to handle other adjacent extract pairs.
- if (LExtIndex != 0 || RExtIndex != 1)
+ if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
return Op;
SDValue X = LHS.getOperand(0);
EVT VecVT = X.getValueType();
unsigned BitWidth = VecVT.getSizeInBits();
+ unsigned NumLanes = BitWidth / 128;
+ unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
"Not expecting illegal vector widths here");
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
- // equivalent, so extract the 256/512-bit source op to 128-bit.
- // This is free: ymm/zmm -> xmm.
+ // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
SDLoc DL(Op);
- if (BitWidth == 256 || BitWidth == 512)
- X = extract128BitVector(X, 0, DAG, DL);
+ if (BitWidth == 256 || BitWidth == 512) {
+ unsigned LaneIdx = LExtIndex / NumEltsPerLane;
+ X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
+ LExtIndex %= NumEltsPerLane;
+ }
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
+ // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
// sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
- DAG.getIntPtrConstant(0, DL));
+ DAG.getIntPtrConstant(LExtIndex / 2, DL));
}
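// Editorial sketch (illustrative only, not part of this patch) of the index
// arithmetic above for a v4f32 source: haddps with both operands equal to X
// produces {X[0]+X[1], X[2]+X[3], X[0]+X[1], X[2]+X[3]}, so the sum of the
// adjacent pair starting at element 2*k lands in result element k, which is
// why the final extract uses LExtIndex / 2.
static float adjacentPairSumViaHaddSketch(const float X[4], int K /* 0 or 1 */) {
  float H[4] = {X[0] + X[1], X[2] + X[3], X[0] + X[1], X[2] + X[3]};
  return H[K]; // equals X[2*K] + X[2*K+1]
}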
/// Depending on uarch and/or optimizing for size, we might prefer to use a
@@ -18732,36 +19361,25 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
-// Check whether an OR'd tree is PTEST-able.
-static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG,
- SDValue &X86CC) {
- assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
-
- if (!Subtarget.hasSSE41())
- return SDValue();
-
- if (!Op->hasOneUse())
- return SDValue();
-
- SDNode *N = Op.getNode();
- SDLoc DL(N);
-
+/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
+/// style scalarized (associative) reduction patterns.
+static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp,
+ SmallVectorImpl<SDValue> &SrcOps) {
SmallVector<SDValue, 8> Opnds;
- DenseMap<SDValue, unsigned> VecInMap;
- SmallVector<SDValue, 8> VecIns;
+ DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
// Recognize a special case where a vector is cast into a wide integer to
// test all 0s.
- Opnds.push_back(N->getOperand(0));
- Opnds.push_back(N->getOperand(1));
+ assert(Op.getOpcode() == unsigned(BinOp) &&
+ "Unexpected bit reduction opcode");
+ Opnds.push_back(Op.getOperand(0));
+ Opnds.push_back(Op.getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
- // BFS traverse all OR'd operands.
- if (I->getOpcode() == ISD::OR) {
+ // BFS traverse all BinOp operands.
+ if (I->getOpcode() == unsigned(BinOp)) {
Opnds.push_back(I->getOperand(0));
Opnds.push_back(I->getOperand(1));
// Re-evaluate the number of nodes to be traversed.
@@ -18771,42 +19389,63 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
// Quit if this is not an EXTRACT_VECTOR_ELT.
if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
+ return false;
// Quit if without a constant index.
SDValue Idx = I->getOperand(1);
if (!isa<ConstantSDNode>(Idx))
- return SDValue();
+ return false;
- SDValue ExtractedFromVec = I->getOperand(0);
- DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
- if (M == VecInMap.end()) {
- VT = ExtractedFromVec.getValueType();
- // Quit if not 128/256-bit vector.
- if (!VT.is128BitVector() && !VT.is256BitVector())
- return SDValue();
+ SDValue Src = I->getOperand(0);
+ DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
+ if (M == SrcOpMap.end()) {
+ VT = Src.getValueType();
// Quit if not the same type.
- if (VecInMap.begin() != VecInMap.end() &&
- VT != VecInMap.begin()->first.getValueType())
- return SDValue();
- M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
- VecIns.push_back(ExtractedFromVec);
+ if (SrcOpMap.begin() != SrcOpMap.end() &&
+ VT != SrcOpMap.begin()->first.getValueType())
+ return false;
+ unsigned NumElts = VT.getVectorNumElements();
+ APInt EltCount = APInt::getNullValue(NumElts);
+ M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
+ SrcOps.push_back(Src);
}
- M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+ // Quit if element already used.
+ unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if (M->second[CIdx])
+ return false;
+ M->second.setBit(CIdx);
}
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "Not extracted from 128-/256-bit vector.");
+ // Quit if not all elements are used.
+ for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
+ E = SrcOpMap.end();
+ I != E; ++I) {
+ if (!I->second.isAllOnesValue())
+ return false;
+ }
- unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+ return true;
+}
- for (DenseMap<SDValue, unsigned>::const_iterator
- I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
- // Quit if not all elements are used.
- if (I->second != FullMask)
- return SDValue();
- }
+// Check whether an OR'd tree is PTEST-able.
+static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, SDValue &X86CC) {
+ assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+ if (!Subtarget.hasSSE41() || !Op->hasOneUse())
+ return SDValue();
+
+ SmallVector<SDValue, 8> VecIns;
+ if (!matchBitOpReduction(Op, ISD::OR, VecIns))
+ return SDValue();
+ // Quit if not 128/256-bit vector.
+ EVT VT = VecIns[0].getValueType();
+ if (!VT.is128BitVector() && !VT.is256BitVector())
+ return SDValue();
+
+ SDLoc DL(Op);
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
// Cast all vectors into TestVT for PTEST.
@@ -18822,10 +19461,9 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
- DL, MVT::i8);
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
- VecIns.back(), VecIns.back());
+ X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL,
+ MVT::i8);
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
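// Editorial sketch (illustrative only, not part of this patch) of the scalar
// reduction this PTEST lowering replaces: OR-ing every extracted element and
// comparing against zero asks whether the whole vector is zero, which a
// single ptest of the vector against itself answers via ZF.
static bool allElementsZeroSketch(const unsigned X[4]) {
  unsigned Acc = 0;
  for (int i = 0; i != 4; ++i)
    Acc |= X[i];     // the OR(EXTRACTELT(X,0), OR(EXTRACTELT(X,1), ...)) tree
  return Acc == 0;   // becomes: ptest %xmm, %xmm ; sete
}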
/// return true if \c Op has a use that doesn't just read flags.
@@ -18963,29 +19601,52 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if (isNullConstant(Op1))
return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
- if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
- Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
- // Only promote the compare up to I32 if it is a 16 bit operation
- // with an immediate. 16 bit immediates are to be avoided.
- if (Op0.getValueType() == MVT::i16 &&
- ((isa<ConstantSDNode>(Op0) &&
- !cast<ConstantSDNode>(Op0)->getAPIntValue().isSignedIntN(8)) ||
- (isa<ConstantSDNode>(Op1) &&
- !cast<ConstantSDNode>(Op1)->getAPIntValue().isSignedIntN(8))) &&
- !DAG.getMachineFunction().getFunction().optForMinSize() &&
- !Subtarget.isAtom()) {
+ EVT CmpVT = Op0.getValueType();
+
+ if (CmpVT.isFloatingPoint())
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+
+ assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
+ CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
+
+ // Only promote the compare up to I32 if it is a 16 bit operation
+ // with an immediate. 16 bit immediates are to be avoided.
+ if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
+ !DAG.getMachineFunction().getFunction().hasMinSize()) {
+ ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
+ ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
+ // Don't do this if the immediate can fit in 8-bits.
+ if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
+ (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
- Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
- Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+ if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
+ // For equality comparisons try to use SIGN_EXTEND if the input was
+ // truncate from something with enough sign bits.
+ if (Op0.getOpcode() == ISD::TRUNCATE) {
+ SDValue In = Op0.getOperand(0);
+ unsigned EffBits =
+ In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
+ if (EffBits <= 16)
+ ExtendOp = ISD::SIGN_EXTEND;
+ } else if (Op1.getOpcode() == ISD::TRUNCATE) {
+ SDValue In = Op1.getOperand(0);
+ unsigned EffBits =
+ In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
+ if (EffBits <= 16)
+ ExtendOp = ISD::SIGN_EXTEND;
+ }
+ }
+
+ CmpVT = MVT::i32;
+ Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
+ Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
}
- // Use SUB instead of CMP to enable CSE between SUB and CMP.
- SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
- SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return SDValue(Sub.getNode(), 1);
}
- assert(Op0.getValueType().isFloatingPoint() && "Unexpected VT!");
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+ // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
+ return Sub.getValue(1);
}
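// Editorial sketch (illustrative only, not part of this patch) of the
// equivalence the i16 -> i32 promotion above relies on: extending both
// operands with the same extension preserves the comparison, while the
// 32-bit cmp avoids the length-changing 66h operand-size prefix that a
// 16-bit immediate compare would need.
static bool cmpUltPromotedSketch(unsigned short A, unsigned short B) {
  // Same result as (A < B) on 16 bits, but evaluated as a 32-bit compare.
  return (unsigned)A < (unsigned)B;
}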
/// Convert a comparison if required by the subtarget.
@@ -19146,7 +19807,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
// are optimizing for size and the immediate won't fit in a byte.
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
@@ -19290,10 +19951,11 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}
-/// Given a simple buildvector constant, return a new vector constant with each
-/// element decremented. If decrementing would result in underflow or this
-/// is not a simple vector constant, return an empty value.
-static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
+/// Given a buildvector constant, return a new vector constant with each element
+/// incremented or decremented. If incrementing or decrementing would result in
+/// unsigned overflow or underflow or this is not a simple vector constant,
+/// return an empty value.
+static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
if (!BV)
return SDValue();
@@ -19308,11 +19970,12 @@ static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
return SDValue();
- // Avoid underflow.
- if (Elt->getAPIntValue().isNullValue())
+ // Avoid overflow/underflow.
+ const APInt &EltC = Elt->getAPIntValue();
+ if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
return SDValue();
- NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));
+ NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
}
return DAG.getBuildVector(VT, DL, NewVecC);
@@ -19344,12 +20007,24 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
// Only do this pre-AVX since vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
return SDValue();
- SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);
+ SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false);
if (!ULEOp1)
return SDValue();
Op1 = ULEOp1;
break;
}
+ case ISD::SETUGT: {
+ // If the comparison is against a constant, we can turn this into a setuge.
+ // This is beneficial because materializing a constant 0 for the PCMPEQ is
+ // probably cheaper than XOR+PCMPGT using 2 different vector constants:
+ // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
+ SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true);
+ if (!UGEOp1)
+ return SDValue();
+ Op1 = Op0;
+ Op0 = UGEOp1;
+ break;
+ }
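// Editorial sketch (illustrative only, not part of this patch) of the SETUGT
// rewrite above for 8-bit elements: X >u C holds exactly when the saturating
// subtraction (C+1) -sat X bottoms out at 0, which PCMPEQ can test without
// materializing a sign-flip constant. The bail-out above guarantees C+1 does
// not overflow.
static bool ugtViaUsubsatSketch(unsigned char X, unsigned char C /* C < 255 */) {
  unsigned char CPlus1 = (unsigned char)(C + 1);
  unsigned char Sat = CPlus1 > X ? (unsigned char)(CPlus1 - X) : 0; // usubsat
  return Sat == 0; // e.g. C = 100: X = 101 -> true, X = 100 -> false
}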
// Psubus is better than flip-sign because it requires no inversion.
case ISD::SETUGE:
std::swap(Op0, Op1);
@@ -19446,10 +20121,6 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
- // Break 256-bit integer vector compare into smaller ones.
- if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntVSETCC(Op, DAG);
-
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In AVX-512 architecture setcc returns mask with i1 elements,
@@ -19503,6 +20174,27 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
}
+ // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
+ if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
+ Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
+ ConstantSDNode *C1 = isConstOrConstSplat(Op1);
+ if (C1 && C1->getAPIntValue().isPowerOf2()) {
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
+
+ SDValue Result = Op0.getOperand(0);
+ Result = DAG.getNode(ISD::SHL, dl, VT, Result,
+ DAG.getConstant(ShiftAmt, dl, VT));
+ Result = DAG.getNode(ISD::SRA, dl, VT, Result,
+ DAG.getConstant(BitWidth - 1, dl, VT));
+ return Result;
+ }
+ }
+
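// Editorial sketch (illustrative only, not part of this patch) of the
// power-of-two mask test above, using 8-bit elements and C = 4 (bit 2): the
// SHL moves bit 2 into the sign position and the SRA by BitWidth-1 broadcasts
// it, yielding all-ones when (X & 4) == 4 and zero otherwise. The final
// select here stands in for the arithmetic right shift.
static signed char eqPow2MaskViaShiftsSketch(unsigned char X) {
  const unsigned BitWidth = 8, Log2C = 2;
  unsigned char Hi = (unsigned char)(X << (BitWidth - Log2C - 1)); // bit 2 -> bit 7
  return (Hi & 0x80) ? (signed char)-1 : (signed char)0;           // sra by 7
}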
+ // Break 256-bit integer vector compare into smaller ones.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntVSETCC(Op, DAG);
+
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT.
// which will be swapped to SETGT.
@@ -19530,17 +20222,20 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
TLI.isOperationLegal(ISD::UMIN, VT)) {
// If we have a constant operand, increment/decrement it and change the
// condition to avoid an invert.
- // TODO: This could be extended to handle a non-splat constant by checking
- // that each element of the constant is not the max/null value.
- APInt C;
- if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) {
+ if (Cond == ISD::SETUGT &&
+ ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
+ return !C->getAPIntValue().isMaxValue();
+ })) {
// X > C --> X >= (C+1) --> X == umax(X, C+1)
- Op1 = DAG.getConstant(C + 1, dl, VT);
+ Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT));
Cond = ISD::SETUGE;
}
- if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) {
+ if (Cond == ISD::SETULT &&
+ ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
+ return !C->getAPIntValue().isNullValue();
+ })) {
// X < C --> X <= (C-1) --> X == umin(X, C-1)
- Op1 = DAG.getConstant(C - 1, dl, VT);
+ Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT));
Cond = ISD::SETULE;
}
bool Invert = false;
@@ -19826,7 +20521,7 @@ getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
break;
case ISD::UADDO:
BaseOp = X86ISD::ADD;
- Cond = X86::COND_B;
+ Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
break;
case ISD::SSUBO:
BaseOp = X86ISD::SUB;
@@ -19867,6 +20562,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
+ assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
}
@@ -20036,10 +20732,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
- SDValue Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
+ SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
Zero = DAG.getConstant(0, DL, Op.getValueType());
- return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
+ return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
}
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
@@ -20111,7 +20807,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
- unsigned Opc = Cmp.getOpcode();
MVT VT = Op.getSimpleValueType();
bool IllegalFPCMov = false;
@@ -20120,7 +20815,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
- Opc == X86ISD::BT) { // FIXME
+ Cmp.getOpcode() == X86ISD::BT) { // FIXME
Cond = Cmp;
AddTest = false;
}
@@ -20193,8 +20888,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
- // Promote i16 cmovs if it won't prevent folding a load.
- if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
+ // Or finally, promote i8 cmovs if we have CMOV,
+ // or i16 cmovs if it won't prevent folding a load.
+ // FIXME: we should not limit promotion of i8 case to only when the CMOV is
+  // legal, but EmitLoweredSelect() cannot deal with these extensions
+ // being inserted between two CMOV's. (in i16 case too TBN)
+ // https://bugs.llvm.org/show_bug.cgi?id=40974
+ if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
+ (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
+ !MayFoldLoad(Op2))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -20453,6 +21155,76 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+/// Change a vector store into a pair of half-size vector stores.
+static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
+ SDValue StoredVal = Store->getValue();
+ assert((StoredVal.getValueType().is256BitVector() ||
+ StoredVal.getValueType().is512BitVector()) &&
+ "Expecting 256/512-bit op");
+
+ // Splitting volatile memory ops is not allowed unless the operation was not
+ // legal to begin with. We are assuming the input op is legal (this transform
+ // is only used for targets with AVX).
+ if (Store->isVolatile())
+ return SDValue();
+
+ MVT StoreVT = StoredVal.getSimpleValueType();
+ unsigned NumElems = StoreVT.getVectorNumElements();
+ unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
+ unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
+
+ SDLoc DL(Store);
+ SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
+ SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
+ SDValue Ptr0 = Store->getBasePtr();
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
+ unsigned Alignment = Store->getAlignment();
+ SDValue Ch0 =
+ DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
+ Alignment, Store->getMemOperand()->getFlags());
+ SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
+ Store->getPointerInfo().getWithOffset(HalfAlign),
+ MinAlign(Alignment, HalfAlign),
+ Store->getMemOperand()->getFlags());
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
+}
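// Editorial sketch (illustrative only, not part of this patch) of the split
// performed above for a 256-bit store: the high half lands at base+16 and its
// alignment is MinAlign(original alignment, 16), so a 32-byte-aligned ymm
// store becomes two xmm-sized stores aligned to 32 and 16 bytes.
#include <cstring>
static void splitStore256Sketch(unsigned char *Dst, const unsigned char Src[32]) {
  std::memcpy(Dst, Src, 16);            // low 128 bits, original alignment
  std::memcpy(Dst + 16, Src + 16, 16);  // high 128 bits, MinAlign(Align, 16)
}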
+
+/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
+/// type.
+static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
+ SelectionDAG &DAG) {
+ SDValue StoredVal = Store->getValue();
+ assert(StoreVT.is128BitVector() &&
+ StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
+ StoredVal = DAG.getBitcast(StoreVT, StoredVal);
+
+ // Splitting volatile memory ops is not allowed unless the operation was not
+ // legal to begin with. We are assuming the input op is legal (this transform
+ // is only used for targets with AVX).
+ if (Store->isVolatile())
+ return SDValue();
+
+ MVT StoreSVT = StoreVT.getScalarType();
+ unsigned NumElems = StoreVT.getVectorNumElements();
+ unsigned ScalarSize = StoreSVT.getStoreSize();
+ unsigned Alignment = Store->getAlignment();
+
+ SDLoc DL(Store);
+ SmallVector<SDValue, 4> Stores;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Offset = i * ScalarSize;
+ SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
+ SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
+ DAG.getIntPtrConstant(i, DL));
+ SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
+ Store->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Alignment, Offset),
+ Store->getMemOperand()->getFlags());
+ Stores.push_back(Ch);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
@@ -20482,28 +21254,47 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
if (St->isTruncatingStore())
return SDValue();
+ // If this is a 256-bit store of concatenated ops, we are better off splitting
+ // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
+ // and each half can execute independently. Some cores would split the op into
+ // halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
+ if (StoreVT.is256BitVector()) {
+ SmallVector<SDValue, 4> CatOps;
+ if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
+ return splitVectorStore(St, DAG);
+ return SDValue();
+ }
+
assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) !=
TargetLowering::TypeWidenVector)
return SDValue();
- // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
- // and store it.
MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
StoreVT.getVectorNumElements() * 2);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
- MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
- MVT CastVT = MVT::getVectorVT(StVT, 2);
- StoredVal = DAG.getBitcast(CastVT, StoredVal);
- StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
- DAG.getIntPtrConstant(0, dl));
- return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
- St->getMemOperand()->getFlags());
+ if (Subtarget.hasSSE2()) {
+ // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
+ // and store it.
+ MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
+ MVT CastVT = MVT::getVectorVT(StVT, 2);
+ StoredVal = DAG.getBitcast(CastVT, StoredVal);
+ StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
+ DAG.getIntPtrConstant(0, dl));
+
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
+ }
+ assert(Subtarget.hasSSE1() && "Expected SSE");
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
+ return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
+ St->getMemOperand());
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
@@ -20694,13 +21485,13 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
unsigned SizeRatio = RegSz / MemSz;
if (Ext == ISD::SEXTLOAD) {
- SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG);
+ SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG);
return DAG.getMergeValues({Sext, TF}, dl);
}
if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
MemVT == MVT::v8i8) {
- SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG);
+ SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG);
return DAG.getMergeValues({Sext, TF}, dl);
}
@@ -21240,42 +22031,41 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
- ConstantSDNode *ND;
- switch(Opc) {
+ switch (Opc) {
default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
- for (unsigned i=0; i!=NumElts; ++i) {
+ for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
- ND = cast<ConstantSDNode>(CurrentOp);
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRLI:
- for (unsigned i=0; i!=NumElts; ++i) {
+ for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
- ND = cast<ConstantSDNode>(CurrentOp);
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRAI:
- for (unsigned i=0; i!=NumElts; ++i) {
+ for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
- ND = cast<ConstantSDNode>(CurrentOp);
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
}
@@ -21443,7 +22233,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
DAG.getBitcast(MVT::v8i1, Mask),
DAG.getIntPtrConstant(0, dl));
if (Op.getOpcode() == X86ISD::FSETCCM ||
- Op.getOpcode() == X86ISD::FSETCCM_RND ||
+ Op.getOpcode() == X86ISD::FSETCCM_SAE ||
Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
@@ -21517,11 +22307,31 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
- if (!isa<ConstantSDNode>(Rnd))
- return false;
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
+ return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
+
+ return false;
+ };
+ auto isRoundModeSAE = [](SDValue Rnd) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
+ return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC;
- unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
- return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
+ return false;
+ };
+ auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
+ RC = C->getZExtValue();
+ if (RC & X86::STATIC_ROUNDING::NO_EXC) {
+ // Clear the NO_EXC bit and check remaining bits.
+ RC ^= X86::STATIC_ROUNDING::NO_EXC;
+ return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
+ RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
+ RC == X86::STATIC_ROUNDING::TO_POS_INF ||
+ RC == X86::STATIC_ROUNDING::TO_ZERO;
+ }
+ }
+
+ return false;
};
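+  // As an illustration, assuming X86::STATIC_ROUNDING follows the _MM_FROUND
+  // encoding (TO_NEAREST_INT=0, TO_NEG_INF=1, TO_POS_INF=2, TO_ZERO=3,
+  // CUR_DIRECTION=4, NO_EXC=8): an operand of (TO_ZERO | NO_EXC) == 11 passes
+  // isRoundModeSAEToX with RC == TO_ZERO, a bare NO_EXC == 8 passes
+  // isRoundModeSAE, and CUR_DIRECTION == 4 passes isRoundModeCurDirection.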
SDLoc dl(Op);
@@ -21537,13 +22347,29 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(2);
- if (!isRoundModeCurDirection(Rnd)) {
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
- Op.getOperand(1), Rnd);
- }
+ Op.getOperand(1),
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
}
+ case INTR_TYPE_1OP_SAE: {
+ SDValue Sae = Op.getOperand(2);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
+ }
case INTR_TYPE_2OP: {
SDValue Src2 = Op.getOperand(2);
@@ -21553,15 +22379,32 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(3);
- if (!isRoundModeCurDirection(Rnd)) {
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
- Op.getOperand(1), Src2, Rnd);
- }
+ Op.getOperand(1), Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Src2);
}
+ case INTR_TYPE_2OP_SAE: {
+ SDValue Sae = Op.getOperand(3);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ }
case INTR_TYPE_3OP:
case INTR_TYPE_3OP_IMM8: {
SDValue Src1 = Op.getOperand(1);
@@ -21577,11 +22420,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd)) {
- return DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src1, Src2, Src3, Rnd);
- }
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Src1, Src2, Src3,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
@@ -21590,44 +22435,45 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
- case INTR_TYPE_1OP_MASK_RM: {
- SDValue Src = Op.getOperand(1);
- SDValue PassThru = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
- SDValue RoundingMode;
- // We always add rounding mode to the Node.
- // If the rounding mode is not specified, we add the
- // "current direction" mode.
- if (Op.getNumOperands() == 4)
- RoundingMode =
- DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- else
- RoundingMode = Op.getOperand(4);
- assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
- RoundingMode),
- Mask, PassThru, Subtarget, DAG);
- }
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// We add rounding mode to the Node when
- // - RM Opcode is specified and
- // - RM is not "current direction".
+ // - RC Opcode is specified and
+ // - RC is not "current direction".
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return getVectorMaskingNode(
+ DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
+ Mask, PassThru, Subtarget, DAG);
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
Mask, PassThru, Subtarget, DAG);
}
+ case INTR_TYPE_1OP_MASK_SAE: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue Rnd = Op.getOperand(4);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Rnd))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Rnd))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src),
+ Mask, PassThru, Subtarget, DAG);
+ }
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -21641,10 +22487,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Op.getNumOperands() == (5U + HasRounding)) {
if (HasRounding) {
SDValue Rnd = Op.getOperand(5);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return getScalarMaskingNode(
+ DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32)),
+ Mask, passThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, VT, Src1, Src2, Rnd),
- Mask, passThru, Subtarget, DAG);
+ return SDValue();
}
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
Src2),
@@ -21654,123 +22504,138 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
assert(Op.getNumOperands() == (6U + HasRounding) &&
"Unexpected intrinsic form");
SDValue RoundingMode = Op.getOperand(5);
+ unsigned Opc = IntrData->Opc0;
if (HasRounding) {
SDValue Sae = Op.getOperand(6);
- if (!isRoundModeCurDirection(Sae))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, VT, Src1, Src2,
- RoundingMode, Sae),
- Mask, passThru, Subtarget, DAG);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrWithRoundingModeOpcode;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
Src2, RoundingMode),
Mask, passThru, Subtarget, DAG);
}
- case INTR_TYPE_SCALAR_MASK_RM: {
+ case INTR_TYPE_SCALAR_MASK_RND: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
- SDValue Src0 = Op.getOperand(3);
+ SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- // There are 2 kinds of intrinsics in this group:
- // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
- // (2) With rounding mode and sae - 7 operands.
- if (Op.getNumOperands() == 6) {
- SDValue Sae = Op.getOperand(5);
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- Sae),
- Mask, Src0, Subtarget, DAG);
- }
- assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
- SDValue RoundingMode = Op.getOperand(5);
- SDValue Sae = Op.getOperand(6);
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- RoundingMode, Sae),
- Mask, Src0, Subtarget, DAG);
+ SDValue Rnd = Op.getOperand(5);
+
+ SDValue NewOp;
+ unsigned RC = 0;
+ if (isRoundModeCurDirection(Rnd))
+ NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
+ else if (isRoundModeSAEToX(Rnd, RC))
+ NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ else
+ return SDValue();
+
+ return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK_SAE: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue passThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ SDValue Sae = Op.getOperand(5);
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
+ Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
-
- // We specify 2 possible opcodes for intrinsics with rounding modes.
- // First, we check if the intrinsic may have non-default rounding mode,
- // (IntrData->Opc1 != 0), then we check the rounding mode operand.
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
+ SDValue NewOp;
+ if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src1, Src2, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ else if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
- // TODO: Intrinsics should have fast-math-flags to propagate.
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
- Mask, PassThru, Subtarget, DAG);
+ if (!NewOp)
+ NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
+ return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_2OP_MASK_RM: {
+ case INTR_TYPE_2OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- // We specify 2 possible modes for intrinsics, with/without rounding
- // modes.
- // First, we check if the intrinsic have rounding mode (6 operands),
- // if not, we set rounding mode to "current".
- SDValue Rnd;
- if (Op.getNumOperands() == 6)
- Rnd = Op.getOperand(5);
- else
- Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Rnd),
+
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(5);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_3OP_SCALAR_MASK: {
+ case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(6);
- if (!isRoundModeCurDirection(Rnd))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, VT, Src1, Src2, Src3, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
- Src2, Src3),
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_3OP_MASK: {
+ case INTR_TYPE_3OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
- // We specify 2 possible opcodes for intrinsics with rounding modes.
- // First, we check if the intrinsic may have non-default rounding mode,
- // (IntrData->Opc1 != 0), then we check the rounding mode operand.
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(6);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src1, Src2, Src3, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(6);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3),
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
+ case BLENDV: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+
+ EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
+ Src3 = DAG.getBitcast(MaskVT, Src3);
+
+ // Reverse the operands to match VSELECT order.
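+ // (blendv(a, b, mask) == vselect(mask, b, a): elements come from b where the
+ // sign bit of mask is set, otherwise from a.)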
+ return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
+ }
case VPERM_2OP : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -21783,35 +22648,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// first.
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
- case CVTPD2PS:
- // ISD::FP_ROUND has a second argument that indicates if the truncation
- // does not change the value. Set it to 0 since it can change.
- return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
- DAG.getIntPtrConstant(0, dl));
- case CVTPD2PS_RND_MASK: {
- SDValue Src = Op.getOperand(1);
- SDValue PassThru = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
- // We add rounding mode to the Node when
- // - RM Opcode is specified and
- // - RM is not "current direction".
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
- }
- assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
- // ISD::FP_ROUND has a second argument that indicates if the truncation
- // does not change the value. Set it to 0 since it can change.
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
- DAG.getIntPtrConstant(0, dl)),
- Mask, PassThru, Subtarget, DAG);
- }
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
@@ -21829,24 +22665,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
- SDValue Cmp;
SDValue CC = Op.getOperand(3);
CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
- SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC, Rnd);
+ SDValue Sae = Op.getOperand(4);
+ if (isRoundModeSAE(Sae))
+ return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC, Sae);
+ if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
//default rounding mode
- if (!Cmp.getNode())
- Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC);
-
- return Cmp;
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
@@ -21856,12 +22690,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Cmp;
if (IntrData->Opc1 != 0) {
- SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
+ SDValue Sae = Op.getOperand(5);
+ if (isRoundModeSAE(Sae))
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
//default rounding mode
- if(!Cmp.getNode())
+ if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
@@ -21921,9 +22757,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8));
- else
- FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
+ else if (isRoundModeSAE(Sae))
+ FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ else
+ return SDValue();
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
@@ -21940,41 +22778,42 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
- if (isAllOnesConstant(Mask)) // return data as is
+ if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
return Op.getOperand(1);
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- DataToCompress),
- Mask, PassThru, Subtarget, DAG);
+ // Avoid false dependency.
+ if (PassThru.isUndef())
+ PassThru = DAG.getConstant(0, dl, VT);
+
+ return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
+ Mask);
}
- case FIXUPIMMS:
- case FIXUPIMMS_MASKZ:
case FIXUPIMM:
- case FIXUPIMM_MASKZ:{
+ case FIXUPIMM_MASKZ: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Imm = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
- SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
- Src1 : getZeroVector(VT, Subtarget, DAG, dl);
- // We specify 2 possible modes for intrinsics, with/without rounding
- // modes.
- // First, we check if the intrinsic have rounding mode (7 operands),
- // if not, we set rounding mode to "current".
- SDValue Rnd;
- if (Op.getNumOperands() == 7)
- Rnd = Op.getOperand(6);
- else
- Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3, Imm, Rnd),
- Mask, Passthru, Subtarget, DAG);
- else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3, Imm, Rnd),
- Mask, Passthru, Subtarget, DAG);
+ SDValue Passthru = (IntrData->Type == FIXUPIMM)
+ ? Src1
+ : getZeroVector(VT, Subtarget, DAG, dl);
+
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(6);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+
+ SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
+
+ if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
+ return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
+
+ return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
}
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
@@ -22018,7 +22857,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMergeValues(Results, dl);
}
case CVTPD2PS_MASK:
- case CVTPD2I_MASK:
+ case CVTPD2DQ_MASK:
+ case CVTQQ2PS_MASK:
case TRUNCATE_TO_REG: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
@@ -22049,6 +22889,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
PassThru, Mask);
}
+ case CVTNEPS2BF16_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+
+ if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
+
+ // Break false dependency.
+ if (PassThru.isUndef())
+ PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
+
+ return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
+ Mask);
+ }
default:
break;
}
@@ -22279,10 +23134,37 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
- else // This function handles the SP or FP case.
- Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ else { // Handles the SP or FP case.
+ bool CantUseFP = RegInfo->needsStackRealignment(MF);
+ if (CantUseFP)
+ Reg = RegInfo->getPtrSizedStackRegister(MF);
+ else
+ Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ }
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
+
+ case Intrinsic::x86_avx512_vp2intersect_q_512:
+ case Intrinsic::x86_avx512_vp2intersect_q_256:
+ case Intrinsic::x86_avx512_vp2intersect_q_128:
+ case Intrinsic::x86_avx512_vp2intersect_d_512:
+ case Intrinsic::x86_avx512_vp2intersect_d_256:
+ case Intrinsic::x86_avx512_vp2intersect_d_128: {
+ MVT MaskVT = Op.getSimpleValueType();
+
+ SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
+ SDLoc DL(Op);
+
+ SDValue Operation =
+ DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
+ Op->getOperand(1), Op->getOperand(2));
+
+ SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
+ MaskVT, Operation);
+ SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
+ MaskVT, Operation);
+ return DAG.getMergeValues({Result0, Result1}, DL);
+ }
}
}
@@ -22296,25 +23178,26 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- EVT MaskVT = Mask.getValueType();
+ EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
- SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
- SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, dl);
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
-static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Mask, SDValue Base,
- SDValue Index, SDValue ScaleOp, SDValue Chain,
- const X86Subtarget &Subtarget) {
+static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
@@ -22332,17 +23215,18 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
- SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
- SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, dl);
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -22355,8 +23239,6 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
Src.getSimpleValueType().getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
@@ -22366,10 +23248,13 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
- SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- return SDValue(Res, 1);
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
+ VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return Res.getValue(1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -22392,24 +23277,37 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
return SDValue(Res, 0);
}
-/// Handles the lowering of builtin intrinsic that return the value
-/// of the extended control register.
-static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SmallVectorImpl<SDValue> &Results) {
- assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue LO, HI;
+/// Handles the lowering of builtin intrinsics with chain that return their
+/// value into registers EDX:EAX.
+/// If operand SrcReg is a valid register identifier, then operand 2 of N is
+/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
+/// TargetOpcode.
+/// Returns a Glue value which can be used to add an extra copy-from-reg if the
+/// expanded intrinsic implicitly defines extra registers (i.e. not just
+/// EDX:EAX).
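+/// For example, RDTSC/RDTSCP are expanded through this helper with no SrcReg,
+/// while the RDPMC and XGETBV lowerings below pass X86::ECX as SrcReg so that
+/// operand 2 selects the performance-counter or XCR index.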
+static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ unsigned TargetOpcode,
+ unsigned SrcReg,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Glue;
- // The ECX register is used to select the index of the XCR register to
- // return.
- SDValue Chain =
- DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
- SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
+ if (SrcReg) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
+ Glue = Chain.getValue(1);
+ }
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue N1Ops[] = {Chain, Glue};
+ SDNode *N1 = DAG.getMachineNode(
+ TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
Chain = SDValue(N1, 0);
// Reads the content of XCR and returns it in registers EDX:EAX.
+ SDValue LO, HI;
if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
@@ -22420,60 +23318,15 @@ static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
LO.getValue(2));
}
Chain = HI.getValue(1);
+ Glue = HI.getValue(2);
if (Subtarget.is64Bit()) {
- // Merge the two 32-bit values into a 64-bit one..
+ // Merge the two 32-bit values into a 64-bit one.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
- return;
- }
-
- // Use a buildpair to merge the two 32-bit values into a 64-bit one.
- SDValue Ops[] = { LO, HI };
- SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
- Results.push_back(Pair);
- Results.push_back(Chain);
-}
-
-/// Handles the lowering of builtin intrinsics that read performance monitor
-/// counters (x86_rdpmc).
-static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SmallVectorImpl<SDValue> &Results) {
- assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue LO, HI;
-
- // The ECX register is used to select the index of the performance counter
- // to read.
- SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
- N->getOperand(2));
- SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
-
- // Reads the content of a 64-bit performance counter and returns it in the
- // registers EDX:EAX.
- if (Subtarget.is64Bit()) {
- LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
- LO.getValue(2));
- } else {
- LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
- LO.getValue(2));
- }
- Chain = HI.getValue(1);
-
- if (Subtarget.is64Bit()) {
- // The EAX register is loaded with the low-order 32 bits. The EDX register
- // is loaded with the supported high-order bits of the counter.
- SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
- DAG.getConstant(32, DL, MVT::i8));
- Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
- Results.push_back(Chain);
- return;
+ return Glue;
}
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
@@ -22481,6 +23334,7 @@ static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
Results.push_back(Pair);
Results.push_back(Chain);
+ return Glue;
}
/// Handles the lowering of builtin intrinsics that read the time stamp counter
@@ -22490,59 +23344,28 @@ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
- SDValue LO, HI;
-
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
- if (Subtarget.is64Bit()) {
- LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
- LO.getValue(2));
- } else {
- LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
- LO.getValue(2));
- }
- SDValue Chain = HI.getValue(1);
-
- SDValue TSC;
- if (Subtarget.is64Bit()) {
- // The EDX register is loaded with the high-order 32 bits of the MSR, and
- // the EAX register is loaded with the low-order 32 bits.
- TSC = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
- DAG.getConstant(32, DL, MVT::i8));
- TSC = DAG.getNode(ISD::OR, DL, MVT::i64, LO, TSC);
- } else {
- // Use a buildpair to merge the two 32-bit values into a 64-bit one.
- TSC = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, { LO, HI });
- }
-
- if (Opcode == X86ISD::RDTSCP_DAG) {
- assert(N->getNumOperands() == 2 && "Unexpected number of operands!");
-
- // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
- // the ECX register. Add 'ecx' explicitly to the chain.
- SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
- HI.getValue(2));
-
- Results.push_back(TSC);
- Results.push_back(ecx);
- Results.push_back(ecx.getValue(1));
+ SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
+ /* NoRegister */0, Subtarget,
+ Results);
+ if (Opcode != X86::RDTSCP)
return;
- }
- Results.push_back(TSC);
- Results.push_back(Chain);
+ SDValue Chain = Results[1];
+ // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
+ // the ECX register. Add 'ecx' explicitly to the chain.
+ SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
+ Results[1] = ecx;
+ Results.push_back(ecx.getValue(1));
}
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<SDValue, 3> Results;
SDLoc DL(Op);
- getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, DL);
}
@@ -22621,6 +23444,22 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
+ case llvm::Intrinsic::x86_rdpkru: {
+ SDLoc dl(Op);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ // Create a RDPKRU node and pass 0 to the ECX parameter.
+ return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
+ DAG.getConstant(0, dl, MVT::i32));
+ }
+ case llvm::Intrinsic::x86_wrpkru: {
+ SDLoc dl(Op);
+ // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
+ // to the EDX and ECX parameters.
+ return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
+ Op.getOperand(0), Op.getOperand(2),
+ DAG.getConstant(0, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ }
case llvm::Intrinsic::x86_flags_read_u32:
case llvm::Intrinsic::x86_flags_read_u64:
case llvm::Intrinsic::x86_flags_write_u32:
@@ -22630,7 +23469,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
- // during ExpandISelPseudos in EmitInstrWithCustomInserter.
+ // during FinalizeISel in EmitInstrWithCustomInserter.
return SDValue();
}
case Intrinsic::x86_lwpins32:
@@ -22660,8 +23499,28 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
- SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
+ Operation.getValue(1));
+ }
+ case Intrinsic::x86_enqcmd:
+ case Intrinsic::x86_enqcmds: {
+ SDLoc dl(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic!");
+ case Intrinsic::x86_enqcmd:
+ Opcode = X86ISD::ENQCMD;
+ break;
+ case Intrinsic::x86_enqcmds:
+ Opcode = X86ISD::ENQCMDS;
+ break;
+ }
+ SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
+ Op.getOperand(3));
+ SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
}
@@ -22707,7 +23566,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
+ return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {
@@ -22743,15 +23602,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
- case RDPMC: {
- SmallVector<SDValue, 2> Results;
- getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
- return DAG.getMergeValues(Results, dl);
- }
- // Get Extended Control Register.
+ case RDPMC:
+ // Get Extended Control Register.
case XGETBV: {
SmallVector<SDValue, 2> Results;
- getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
+
+ // RDPMC uses ECX to select the index of the performance counter to read.
+ // XGETBV uses ECX to select the index of the XCR register to return.
+ // The result is stored into registers EDX:EAX.
+ expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
+ Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
@@ -22861,7 +23721,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
- SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
+ SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
FuncInfo->setFAIndex(FrameAddrIndex);
}
return DAG.getFrameIndex(FrameAddrIndex, VT);
@@ -23444,10 +24304,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
- // Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
-
assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");
@@ -23539,22 +24395,48 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
return split256IntArith(Op, DAG);
}
-static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
+ unsigned Opcode = Op.getOpcode();
if (VT.getScalarType() == MVT::i1) {
SDLoc dl(Op);
- switch (Op.getOpcode()) {
+ switch (Opcode) {
default: llvm_unreachable("Expected saturated arithmetic opcode");
case ISD::UADDSAT:
case ISD::SADDSAT:
- return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1));
+ // *addsat i1 X, Y --> X | Y
+ return DAG.getNode(ISD::OR, dl, VT, X, Y);
case ISD::USUBSAT:
case ISD::SSUBSAT:
- return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
- DAG.getNOT(dl, Op.getOperand(1), VT));
+ // *subsat i1 X, Y --> X & ~Y
+ return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
}
}
+ if (VT.is128BitVector()) {
+ // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), VT);
+ SDLoc DL(Op);
+ if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
+ // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
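+ // e.g. for i8 elements, X=200, Y=100: Add wraps to 44 and 200 >u 44, so the
+ // select yields the saturated all-ones value 255.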
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
+ return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
+ }
+ if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
+ // usubsat X, Y --> (X >u Y) ? X - Y : 0
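+ // e.g. for i8 elements, usubsat 100, 200: 100 >u 200 is false, so the result
+ // is 0.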
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
+ return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
+ }
+ // Use default expansion.
+ return SDValue();
+ }
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
@@ -23886,9 +24768,6 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Signed AVX2 implementation - extend xmm subvectors to ymm.
if (VT == MVT::v32i8 && IsSigned) {
- SDValue Lo = DAG.getIntPtrConstant(0, dl);
- SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl);
-
MVT ExVT = MVT::v16i16;
SDValue ALo = extract128BitVector(A, 0, DAG, dl);
SDValue BLo = extract128BitVector(B, 0, DAG, dl);
@@ -23898,8 +24777,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
- Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
- Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
@@ -24156,6 +25035,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
APInt APIntShiftAmt;
if (!isConstantSplat(Amt, APIntShiftAmt))
return SDValue();
+
+ // If the shift amount is out of range, return undef.
+ if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
+ return DAG.getUNDEF(VT);
+
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
@@ -24197,8 +25081,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
- return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
+ APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
+ return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
@@ -24224,54 +25108,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-// If V is a splat value, return the source vector and splat index;
-static SDValue IsSplatVector(SDValue V, int &SplatIdx, SelectionDAG &DAG) {
- V = peekThroughEXTRACT_SUBVECTORs(V);
-
- EVT VT = V.getValueType();
- unsigned Opcode = V.getOpcode();
- switch (Opcode) {
- default: {
- APInt UndefElts;
- APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
- if (DAG.isSplatValue(V, DemandedElts, UndefElts)) {
- // Handle case where all demanded elements are UNDEF.
- if (DemandedElts.isSubsetOf(UndefElts)) {
- SplatIdx = 0;
- return DAG.getUNDEF(VT);
- }
- SplatIdx = (UndefElts & DemandedElts).countTrailingOnes();
- return V;
- }
- break;
- }
- case ISD::VECTOR_SHUFFLE: {
- // Check if this is a shuffle node doing a splat.
- // TODO - remove this and rely purely on SelectionDAG::isSplatValue,
- // getTargetVShiftNode currently struggles without the splat source.
- auto *SVN = cast<ShuffleVectorSDNode>(V);
- if (!SVN->isSplat())
- break;
- int Idx = SVN->getSplatIndex();
- int NumElts = V.getValueType().getVectorNumElements();
- SplatIdx = Idx % NumElts;
- return V.getOperand(Idx / NumElts);
- }
- }
-
- return SDValue();
-}
-
-static SDValue GetSplatValue(SDValue V, const SDLoc &dl,
- SelectionDAG &DAG) {
- int SplatIdx;
- if (SDValue SrcVector = IsSplatVector(V, SplatIdx, DAG))
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- SrcVector.getValueType().getScalarType(), SrcVector,
- DAG.getIntPtrConstant(SplatIdx, dl));
- return SDValue();
-}
-
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
@@ -24282,7 +25118,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
- if (SDValue BaseShAmt = GetSplatValue(Amt, dl, DAG)) {
+ if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
@@ -25102,24 +25938,45 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
- return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
- else if (OpWidth == 128)
+ return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
+ if (OpWidth == 128)
return Subtarget.hasCmpxchg16b();
- else
- return false;
+
+ return false;
}
+// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
+// TODO: In 32-bit mode, use FISTP when X87 is available?
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
- return needsCmpXchgNb(SI->getValueOperand()->getType());
+ Type *MemType = SI->getValueOperand()->getType();
+
+ bool NoImplicitFloatOps =
+ SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
+ return false;
+
+ return needsCmpXchgNb(MemType);
}
// Note: this turns large loads into lock cmpxchg8b/16b.
-// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
+// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
- auto PTy = cast<PointerType>(LI->getPointerOperandType());
- return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
- : AtomicExpansionKind::None;
+ Type *MemType = LI->getType();
+
+ // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
+ // can use movq to do the load. If we have X87, we can load into an 80-bit
+ // X87 register and store it to a stack temporary.
+ bool NoImplicitFloatOps =
+ LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ (Subtarget.hasSSE2() || Subtarget.hasX87()))
+ return AtomicExpansionKind::None;
+
+ return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
@@ -25155,6 +26012,8 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
return AtomicExpansionKind::CmpXChg;
@@ -25171,13 +26030,20 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;
+ // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+ // lowering available in lowerAtomicArith.
+ // TODO: push more cases through this path.
+ if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+ if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+ AI->use_empty())
+ return nullptr;
+
auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
- auto Ptr = AI->getPointerOperand();
// Before the load we need a fence. Here is an example lifted from
// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
@@ -25212,14 +26078,80 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
Builder.CreateCall(MFence, {});
// Finally we can emit the atomic load.
- LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
- AI->getType()->getPrimitiveSizeInBits());
+ LoadInst *Loaded =
+ Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
+ AI->getType()->getPrimitiveSizeInBits());
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
}
+/// Emit a locked operation on a stack location which does not change any
+/// memory location, but does involve a lock prefix. Location is chosen to be
+/// a) very likely accessed only by a single thread to minimize cache traffic,
+/// and b) definitely dereferenceable. Returns the new Chain result.
+static SDValue emitLockedStackOp(SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDValue Chain, SDLoc DL) {
+ // Implementation notes:
+ // 1) LOCK prefix creates a full read/write reordering barrier for memory
+ // operations issued by the current processor. As such, the location
+ // referenced is not relevant for the ordering properties of the instruction.
+ // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
+ // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
+ // 2) Using an immediate operand appears to be the best encoding choice
+ // here since it doesn't require an extra register.
+ // 3) OR appears to be very slightly faster than ADD. (Though, the difference
+ // is small enough it might just be measurement noise.)
+ // 4) When choosing offsets, there are several contributing factors:
+ // a) If there's no redzone, we default to TOS. (We could allocate a cache
+ // line aligned stack object to improve this case.)
+ // b) To minimize our chances of introducing a false dependence, we prefer
+ // to offset the stack usage from TOS slightly.
+ // c) To minimize concerns about cross thread stack usage - in particular,
+ // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
+ // captures state in the TOS frame and accesses it from many threads -
+ // we want to use an offset such that the offset is in a distinct cache
+ // line from the TOS frame.
+ //
+ // For a general discussion of the tradeoffs and benchmark results, see:
+ // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
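+ //
+ // For illustration, the node built below encodes roughly
+ //   lock orl $0, -64(%rsp)
+ // when a 128-byte red zone is available, or the same operation against
+ // (%rsp)/(%esp) with displacement 0 when it is not.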
+
+ auto &MF = DAG.getMachineFunction();
+ auto &TFL = *Subtarget.getFrameLowering();
+ const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
+
+ if (Subtarget.is64Bit()) {
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::RSP, MVT::i64), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i64), // Index
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment.
+ Zero,
+ Chain};
+ SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
+ MVT::Other, Ops);
+ return SDValue(Res, 1);
+ }
+
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment.
+ Zero,
+ Chain
+ };
+ SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
+ MVT::Other, Ops);
+ return SDValue(Res, 1);
+}
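As a hedged sketch of what emitLockedStackOp stands in for (the expected assembly is inferred from the comments above, not verified output): the code that reaches this helper is a seq_cst fence or a seq_cst idempotent RMW, and on x86-64 with a red zone the result should resemble lock orl $0, -64(%rsp).

#include <atomic>

// Assumed C++-level trigger: a seq_cst thread fence on a subtarget where
// MFENCE is unavailable, which this helper lowers to a locked OR against an
// offset stack slot rather than a fence instruction.
void seqCstFence() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
}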
+
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -25235,19 +26167,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
- SDValue Chain = Op.getOperand(0);
- SDValue Zero = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Ops[] = {
- DAG.getRegister(X86::ESP, MVT::i32), // Base
- DAG.getTargetConstant(1, dl, MVT::i8), // Scale
- DAG.getRegister(0, MVT::i32), // Index
- DAG.getTargetConstant(0, dl, MVT::i32), // Disp
- DAG.getRegister(0, MVT::i32), // Segment.
- Zero,
- Chain
- };
- SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, dl, MVT::Other, Ops);
- return SDValue(Res, 0);
+ SDValue Chain = Op.getOperand(0);
+ return emitLockedStackOp(DAG, Subtarget, Chain, dl);
}
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
@@ -25288,10 +26209,8 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
MVT::i32, cpOut.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
- return SDValue();
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+ cpOut, Success, EFLAGS.getValue(1));
}
// Create MOVMSKB, taking into account whether we need to split for AVX1.
@@ -25703,6 +26622,7 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
@@ -25717,7 +26637,6 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
// select LXADD if LOCK_SUB can't be selected.
if (Opc == ISD::ATOMIC_LOAD_SUB) {
- AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
RHS, AN->getMemOperand());
@@ -25727,35 +26646,93 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
return N;
}
+ // Specialized lowering for the canonical form of an idempotent atomicrmw.
+ // The core idea here is that since the memory location isn't actually
+ // changing, all we need is a lowering for the *ordering* impacts of the
+ // atomicrmw. As such, we can choose a different operation and memory
+ // location to minimize impact on other code.
+ if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
+ // On X86, the only ordering that actually requires an instruction is
+ // seq_cst at a scope other than SingleThread; everything else just needs
+ // to be preserved during codegen and then dropped. Note that we expect
+ // (but don't assume) that orderings other than seq_cst and acq_rel have
+ // been canonicalized to a store or load.
+ if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+ AN->getSyncScopeID() == SyncScope::System) {
+ // Prefer a locked operation against a stack location to minimize cache
+ // traffic. This assumes that stack locations are very likely to be
+ // accessed only by the owning thread.
+ SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+ assert(!N->hasAnyUseOfValue(0));
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), NewChain);
+ }
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
+ assert(!N->hasAnyUseOfValue(0));
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), NewChain);
+ }
+
SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
// RAUW the chain, but don't worry about the result, as it's unused.
assert(!N->hasAnyUseOfValue(0));
- DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
- return SDValue();
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), LockOp.getValue(1));
}
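A hypothetical source-level trigger for the idempotent-OR special case above (assuming the front end and AtomicExpand keep the operation in RMW form): the result is unused, so only the ordering has to survive, and anything weaker than system-scope seq_cst degrades to a compiler-only MEMBARRIER.

#include <atomic>

// Result intentionally unused: only the seq_cst ordering matters, which the
// lowering above implements with a locked stack op instead of touching Flags.
void orderingOnly(std::atomic<unsigned> &Flags) {
  Flags.fetch_or(0u, std::memory_order_seq_cst);
}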
-static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
- SDNode *Node = Op.getNode();
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
SDLoc dl(Node);
- EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+ EVT VT = Node->getMemoryVT();
+
+ bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
+ bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
+
+ // If this store is not sequentially consistent and the type is legal
+ // we can just keep it.
+ if (!IsSeqCst && IsTypeLegal)
+ return Op;
+
+ if (VT == MVT::i64 && !IsTypeLegal) {
+ // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
+ // FIXME: Use movlps with SSE1.
+ // FIXME: Use fist with X87.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ Subtarget.hasSSE2()) {
+ SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Node->getOperand(2));
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
+ SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
+ Ops, MVT::i64,
+ Node->getMemOperand());
+
+ // If this is a sequentially consistent store, also emit an appropriate
+ // barrier.
+ if (IsSeqCst)
+ Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+
+ return Chain;
+ }
+ }
// Convert seq_cst store -> xchg
// Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
- // FIXME: On 32-bit, store -> fist or movq would be more efficient
- // (The only way to get a 16-byte store is cmpxchg16b)
// FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
- if (cast<AtomicSDNode>(Node)->getOrdering() ==
- AtomicOrdering::SequentiallyConsistent ||
- !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
- SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
- cast<AtomicSDNode>(Node)->getMemoryVT(),
- Node->getOperand(0),
- Node->getOperand(1), Node->getOperand(2),
- cast<AtomicSDNode>(Node)->getMemOperand());
- return Swap.getValue(1);
- }
- // Other atomic stores have a simple pattern.
- return Op;
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ Node->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ Node->getMemOperand());
+ return Swap.getValue(1);
}
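For orientation, a minimal sketch of the illegal-i64 store case handled above (assuming a 32-bit target where the atomic is lock-free): with SSE2 the value goes out through a MOVQ-style VEXTRACT_STORE plus, for seq_cst, the locked stack op; otherwise it falls back to the ATOMIC_SWAP path.

#include <atomic>
#include <cstdint>

// Assumed trigger: a 64-bit atomic store on i686, which is type-illegal there.
void store64(std::atomic<uint64_t> &Slot, uint64_t V) {
  Slot.store(V, std::memory_order_seq_cst);
}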
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
@@ -25919,7 +26896,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
return SDValue();
@@ -25935,7 +26911,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
// Custom widen all the operands to avoid promotion.
@@ -25980,7 +26955,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
@@ -25991,8 +26965,28 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
+ MVT MaskVT = Mask.getSimpleValueType();
+ SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
+ // Handle AVX masked loads which don't support passthru other than 0.
+ if (MaskVT.getVectorElementType() != MVT::i1) {
+ // We also allow undef in the isel pattern.
+ if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
+ return Op;
+
+ SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
+ N->getBasePtr(), Mask,
+ getZeroVector(VT, Subtarget, DAG, dl),
+ N->getMemoryVT(), N->getMemOperand(),
+ N->getExtensionType(),
+ N->isExpandingLoad());
+ // Emit a blend.
+ SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
+ PassThru);
+ return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
+ }
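A hand-written counterpart of the AVX fallback above, using intrinsics (illustrative only; the function name is hypothetical and this is not the code the compiler emits): AVX masked loads zero the masked-off lanes, so a non-zero, non-undef passthru is recovered with an explicit blend, mirroring the load-then-VSELECT sequence.

#include <immintrin.h>

// Lanes whose mask sign bit is clear come back as zero from the masked load;
// the blend then substitutes the caller's passthru value in those lanes.
__m256 maskedLoadWithPassThru(const float *Ptr, __m256i Mask, __m256 PassThru) {
  __m256 Loaded = _mm256_maskload_ps(Ptr, Mask);
  return _mm256_blendv_ps(PassThru, Loaded, _mm256_castsi256_ps(Mask));
}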
+
assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!");
@@ -26011,7 +27005,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
// VLX the vector should be widened to 512 bit
unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
- SDValue PassThru = ExtendToType(N->getPassThru(), WideDataVT, DAG);
+ PassThru = ExtendToType(PassThru, WideDataVT, DAG);
// Mask element has to be i1.
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
@@ -26179,7 +27173,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
- case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
@@ -26272,7 +27266,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UADDSAT:
case ISD::SADDSAT:
case ISD::USUBSAT:
- case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG);
+ case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
@@ -26301,12 +27295,19 @@ void X86TargetLowering::LowerOperationWrapper(SDNode *N,
if (!Res.getNode())
return;
- assert((N->getNumValues() <= Res->getNumValues()) &&
+ // If the original node has one result, take the return value from
+ // LowerOperation as is. It might not be result number 0.
+ if (N->getNumValues() == 1) {
+ Results.push_back(Res);
+ return;
+ }
+
+ // If the original node has multiple results, then the return node should
+ // have the same number of results.
+ assert((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!");
// Places new result values based on N result number.
- // In some cases (LowerSINT_TO_FP for example) Res has more result values
- // than original node, chain should be dropped(last value).
for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
Results.push_back(Res.getValue(I));
}
@@ -26319,7 +27320,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDLoc dl(N);
switch (N->getOpcode()) {
default:
+#ifndef NDEBUG
+ dbgs() << "ReplaceNodeResults: ";
+ N->dump(&DAG);
+#endif
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case ISD::CTPOP: {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ // Use a v2i64 if possible.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
+ SDValue Wide =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
+ Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
+ // The bit count always fits in 32 bits; extract it as i32 and then zero
+ // extend to i64. Otherwise we end up extracting bits 63:32 separately.
+ Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
+ Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
+ DAG.getIntPtrConstant(0, dl));
+ Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
+ Results.push_back(Wide);
+ }
+ return;
+ }
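For reference, the i64 CTPOP being re-legalized here corresponds at source level to something like the following (the mapping is assumed); the interesting part is that the count is computed in a v2i64 register and only the low 32 bits are extracted, since a 64-bit popcount never exceeds 64.

#include <cstdint>

// Assumed source-level equivalent of an i64 ISD::CTPOP on a 32-bit target.
uint64_t popcount64(uint64_t X) {
  return static_cast<uint64_t>(__builtin_popcountll(X));
}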
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Unexpected VT");
@@ -26385,6 +27410,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res);
return;
}
+ case ISD::ABS: {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ assert(N->getValueType(0) == MVT::i64 &&
+ "Unexpected type (!= i64) on ABS.");
+ MVT HalfT = MVT::i32;
+ SDValue Lo, Hi, Tmp;
+ SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
+
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
+ DAG.getConstant(0, dl, HalfT));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
+ DAG.getConstant(1, dl, HalfT));
+ Tmp = DAG.getNode(
+ ISD::SRA, dl, HalfT, Hi,
+ DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
+ TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
+ Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
+ Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
+ SDValue(Lo.getNode(), 1));
+ Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
+ Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
+ Results.push_back(Lo);
+ Results.push_back(Hi);
+ return;
+ }
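The node sequence above is the classic branchless absolute value split into 32-bit halves. A scalar reference, ignoring the INT64_MIN edge case (whose magnitude is not representable anyway):

#include <cstdint>

// With T = X >> 63 (0 for non-negative X, -1 for negative X):
//   |X| == (X + T) ^ T
// The expansion performs the same computation on i32 halves, using UADDO and
// ADDCARRY for the wide add and XOR with T for the conditional negate.
int64_t absViaSignSpread(int64_t X) {
  int64_t T = X >> 63;
  return (X + T) ^ T;
}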
case ISD::SETCC: {
// Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
// setCC result type is v2i1 because type legalization will end up with
@@ -26557,14 +27607,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
- if (!ExperimentalVectorWideningLegalization)
- return;
-
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
- (InVT == MVT::v4i16 || InVT == MVT::v4i8)) {
+ (InVT == MVT::v4i16 || InVT == MVT::v4i8) &&
+ getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
// sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
@@ -26589,16 +27638,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- if ((VT == MVT::v16i32 || VT == MVT::v8i64) && InVT.is128BitVector()) {
+ if (VT == MVT::v16i32 || VT == MVT::v8i64) {
+ if (!InVT.is128BitVector()) {
+ // Not a 128 bit vector, but maybe type legalization will promote
+ // it to 128 bits.
+ if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
+ return;
+ InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
+ if (!InVT.is128BitVector())
+ return;
+
+ // Promote the input to 128 bits. Type legalization will turn this into
+ // zext_inreg/sext_inreg.
+ In = DAG.getNode(N->getOpcode(), dl, InVT, In);
+ }
+
// Perform custom splitting instead of the two stage extend we would get
// by default.
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
assert(isTypeLegal(LoVT) && "Split VT not legal?");
- bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND;
-
- SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG);
+ SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
// We need to shift the input over by half the number of elements.
unsigned NumElts = InVT.getVectorNumElements();
@@ -26608,7 +27669,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ShufMask[i] = i + HalfNumElts;
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
- Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG);
+ Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
@@ -26735,17 +27796,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- std::pair<SDValue,SDValue> Vals =
- FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
- SDValue FIST = Vals.first, StackSlot = Vals.second;
- if (FIST.getNode()) {
- // Return a load from the stack slot.
- if (StackSlot.getNode())
- Results.push_back(
- DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
- else
- Results.push_back(FIST);
- }
+ if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned))
+ Results.push_back(V);
return;
}
case ISD::SINT_TO_FP: {
@@ -26800,31 +27852,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
case Intrinsic::x86_rdtsc:
- return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
Results);
case Intrinsic::x86_rdtscp:
- return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+ return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
Results);
case Intrinsic::x86_rdpmc:
- return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
-
+ expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
+ Results);
+ return;
case Intrinsic::x86_xgetbv:
- return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
+ expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
+ Results);
+ return;
}
}
- case ISD::INTRINSIC_WO_CHAIN: {
- if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
- Results.push_back(V);
- return;
- }
case ISD::READCYCLECOUNTER: {
- return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
- Results);
+ return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
}
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
+ assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
+ "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
@@ -26903,6 +27954,66 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(EFLAGS.getValue(1));
return;
}
+ case ISD::ATOMIC_LOAD: {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
+ auto *Node = cast<AtomicSDNode>(N);
+ if (Subtarget.hasSSE2()) {
+ // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
+ // lower 64-bits.
+ SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+ SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+ SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ MVT::i64, Node->getMemOperand());
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Ld.getValue(1));
+ return;
+ }
+ if (Subtarget.hasX87()) {
+ // First load this into an 80-bit X87 register. This will put the whole
+ // integer into the significand.
+ // FIXME: Do we need to glue? See FIXME comment in BuildFILD.
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
+ dl, Tys, Ops, MVT::i64,
+ Node->getMemOperand());
+ SDValue Chain = Result.getValue(1);
+ SDValue InFlag = Result.getValue(2);
+
+ // Now store the X87 register to a stack temporary and convert to i64.
+ // This store is not atomic and doesn't need to be.
+ // FIXME: We don't need a stack temporary if the result of the load
+ // is already being stored. We could just directly store there.
+ SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
+ DAG.getVTList(MVT::Other), StoreOps,
+ MVT::i64, MPI, 0 /*Align*/,
+ MachineMemOperand::MOStore);
+
+ // Finally load the value back from the stack temporary and return it.
+ // This load is not atomic and doesn't need to be.
+ // This load will be further type legalized.
+ Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
+ Results.push_back(Result);
+ Results.push_back(Result.getValue(1));
+ return;
+ }
+ }
+ // TODO: Use MOVLPS when SSE1 is available?
+ // Delegate to generic TypeLegalization. Situations we can really handle
+ // should have already been dealt with by AtomicExpandPass.cpp.
+ break;
+ }
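An assumed source pattern for the ATOMIC_LOAD legalization above: a 64-bit atomic load on a 32-bit target. With SSE2 it becomes a MOVQ-style VZEXT_LOAD; with only x87 it round-trips through FILD/FIST and a stack temporary, as the in-code comments describe.

#include <atomic>
#include <cstdint>

// Illegal i64 atomic load on i686 (assuming the atomic is lock-free there).
uint64_t load64(const std::atomic<uint64_t> &Slot) {
  return Slot.load(std::memory_order_acquire);
}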
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
@@ -26914,11 +28025,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
- case ISD::ATOMIC_LOAD: {
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
- }
+
case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
@@ -27061,19 +28171,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
- MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
- SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(),
- Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- SDValue Chain = Res.getValue(1);
- MVT WideVT = MVT::getVectorVT(LdVT, 2);
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
- MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() * 2);
- Res = DAG.getBitcast(CastVT, Res);
+ if (Subtarget.hasSSE2()) {
+ MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
+ SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+ SDValue Chain = Res.getValue(1);
+ MVT WideVT = MVT::getVectorVT(LdVT, 2);
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
+ MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() * 2);
+ Res = DAG.getBitcast(CastVT, Res);
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ assert(Subtarget.hasSSE1() && "Expected SSE");
+ SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+ SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ MVT::i64, Ld->getMemOperand());
Results.push_back(Res);
- Results.push_back(Chain);
+ Results.push_back(Res.getValue(1));
return;
}
}
@@ -27092,26 +28211,22 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FXOR: return "X86ISD::FXOR";
case X86ISD::FILD: return "X86ISD::FILD";
case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
- case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
- case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
- case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+ case X86ISD::FIST: return "X86ISD::FIST";
+ case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM";
case X86ISD::FLD: return "X86ISD::FLD";
case X86ISD::FST: return "X86ISD::FST";
case X86ISD::CALL: return "X86ISD::CALL";
- case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
- case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
- case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
- case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
+ case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCC: return "X86ISD::FSETCC";
case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
- case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
+ case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
@@ -27140,12 +28255,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMAXS: return "X86ISD::FMAXS";
- case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
- case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";
+ case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE";
+ case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FMINS: return "X86ISD::FMINS";
- case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
- case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
+ case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE";
+ case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
@@ -27177,6 +28292,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::LAND: return "X86ISD::LAND";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE";
case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
@@ -27188,11 +28304,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
- case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
- case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
+ case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
+ case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
+ case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
+ case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
@@ -27202,6 +28320,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VSHLI: return "X86ISD::VSHLI";
case X86ISD::VSRLI: return "X86ISD::VSRLI";
case X86ISD::VSRAI: return "X86ISD::VSRAI";
+ case X86ISD::VSHLV: return "X86ISD::VSHLV";
+ case X86ISD::VSRLV: return "X86ISD::VSRLV";
case X86ISD::VSRAV: return "X86ISD::VSRAV";
case X86ISD::VROTLI: return "X86ISD::VROTLI";
case X86ISD::VROTRI: return "X86ISD::VROTRI";
@@ -27263,11 +28383,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
+ case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE";
case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
+ case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
- case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
+ case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE";
case X86ISD::VRANGES: return "X86ISD::VRANGES";
- case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
+ case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::PSADBW: return "X86ISD::PSADBW";
@@ -27281,6 +28403,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
case X86ISD::RDSEED: return "X86ISD::RDSEED";
+ case X86ISD::RDPKRU: return "X86ISD::RDPKRU";
+ case X86ISD::WRPKRU: return "X86ISD::WRPKRU";
case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
case X86ISD::VPSHA: return "X86ISD::VPSHA";
@@ -27302,17 +28426,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
- case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
+ case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
- case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
+ case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
- case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
+ case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE";
case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
- case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
+ case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE";
case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
- case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
+ case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
- case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
+ case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE";
case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
case X86ISD::XTEST: return "X86ISD::XTEST";
@@ -27323,26 +28447,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::RCP14: return "X86ISD::RCP14";
case X86ISD::RCP14S: return "X86ISD::RCP14S";
case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE";
case X86ISD::RCP28S: return "X86ISD::RCP28S";
+ case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE";
case X86ISD::EXP2: return "X86ISD::EXP2";
+ case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE";
case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
+ case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
+ case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FADDS: return "X86ISD::FADDS";
case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FSUBS: return "X86ISD::FSUBS";
case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FMULS: return "X86ISD::FMULS";
case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
+ case X86ISD::FDIVS: return "X86ISD::FDIVS";
case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
+ case X86ISD::FSQRTS: return "X86ISD::FSQRTS";
case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
- case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
- case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
+ case X86ISD::FGETEXP: return "X86ISD::FGETEXP";
+ case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE";
+ case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS";
+ case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE";
case X86ISD::SCALEF: return "X86ISD::SCALEF";
+ case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND";
case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
+ case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND";
case X86ISD::AVG: return "X86ISD::AVG";
case X86ISD::MULHRS: return "X86ISD::MULHRS";
case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
@@ -27351,23 +28489,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
- case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
- case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
+ case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
+ case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE";
case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI";
case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI";
- case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
- case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
+ case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE";
+ case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
+ case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
+ case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
+ case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP";
case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
+ case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP";
case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
- case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
+ case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE";
case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI";
@@ -27378,6 +28520,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
+ case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16";
+ case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16";
+ case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16";
+ case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS";
case X86ISD::LWPINS: return "X86ISD::LWPINS";
case X86ISD::MGATHER: return "X86ISD::MGATHER";
case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
@@ -27393,6 +28539,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
+ case X86ISD::ENQCMD: return "X86ISD::ENQCMD";
+ case X86ISD::ENQCMDS: return "X86ISD::ENQCMDS";
+ case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT";
}
return nullptr;
}
@@ -27478,6 +28627,38 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
return true;
}
+bool X86TargetLowering::isBinOp(unsigned Opcode) const {
+ switch (Opcode) {
+ // These are non-commutative binops.
+ // TODO: Add more X86ISD opcodes once we have test coverage.
+ case X86ISD::ANDNP:
+ case X86ISD::PCMPGT:
+ case X86ISD::FMAX:
+ case X86ISD::FMIN:
+ case X86ISD::FANDN:
+ return true;
+ }
+
+ return TargetLoweringBase::isBinOp(Opcode);
+}
+
+bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
+ switch (Opcode) {
+ // TODO: Add more X86ISD opcodes once we have test coverage.
+ case X86ISD::PCMPEQ:
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ:
+ case X86ISD::FMAXC:
+ case X86ISD::FMINC:
+ case X86ISD::FAND:
+ case X86ISD::FOR:
+ case X86ISD::FXOR:
+ return true;
+ }
+
+ return TargetLoweringBase::isCommutativeBinOp(Opcode);
+}
+
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
@@ -27713,87 +28894,6 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
return sinkMBB;
}
-static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget) {
- DebugLoc dl = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
-
- // insert input VAL into EAX
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
- .addReg(MI.getOperand(0).getReg());
- // insert zero to ECX
- BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
-
- // insert zero to EDX
- BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
-
- // insert WRPKRU instruction
- BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget) {
- DebugLoc dl = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
-
- // insert zero to ECX
- BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
-
- // insert RDPKRU instruction
- BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
- .addReg(X86::EAX);
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget,
- unsigned Opc) {
- DebugLoc dl = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- // Address into RAX/EAX, other two args into ECX, EDX.
- unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
- unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(MI.getOperand(i));
-
- unsigned ValOps = X86::AddrNumOperands;
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
- .addReg(MI.getOperand(ValOps).getReg());
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
- .addReg(MI.getOperand(ValOps + 1).getReg());
-
- // The instruction doesn't actually take any operands though.
- BuildMI(*BB, MI, dl, TII->get(Opc));
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget) {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- // Address into RAX/EAX
- unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
- unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(MI->getOperand(i));
-
- // The instruction doesn't actually take any operands though.
- BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
-
- MI->eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
MachineBasicBlock *
@@ -27823,10 +28923,18 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
unsigned ArgMode = MI.getOperand(7).getImm();
unsigned Align = MI.getOperand(8).getImm();
+ MachineFunction *MF = MBB->getParent();
+
// Memory Reference
assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
- SmallVector<MachineMemOperand *, 1> MMOs(MI.memoperands_begin(),
- MI.memoperands_end());
+
+ MachineMemOperand *OldMMO = MI.memoperands().front();
+
+ // Clone the MMO into two separate MMOs for loading and storing
+ MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
+ OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
+ MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
+ OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
@@ -27891,7 +28999,6 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction *MF = MBB->getParent();
overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -27924,7 +29031,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
- .setMemRefs(MMOs);
+ .setMemRefs(LoadOnlyMMO);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
@@ -27933,8 +29040,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Branch to "overflowMBB" if offset >= max
// Fall through to "offsetMBB" otherwise
- BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
- .addMBB(overflowMBB);
+ BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
+ .addMBB(overflowMBB).addImm(X86::COND_AE);
}
// In offsetMBB, emit code to use the reg_save_area.
@@ -27949,7 +29056,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.add(Index)
.addDisp(Disp, 16)
.add(Segment)
- .setMemRefs(MMOs);
+ .setMemRefs(LoadOnlyMMO);
// Zero-extend the offset
unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
@@ -27977,7 +29084,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.addReg(NextOffsetReg)
- .setMemRefs(MMOs);
+ .setMemRefs(StoreOnlyMMO);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
@@ -27996,7 +29103,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
- .setMemRefs(MMOs);
+ .setMemRefs(LoadOnlyMMO);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
@@ -28033,7 +29140,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.addDisp(Disp, 8)
.add(Segment)
.addReg(NextAddrReg)
- .setMemRefs(MMOs);
+ .setMemRefs(StoreOnlyMMO);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
@@ -28091,7 +29198,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
- BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
+ BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
MBB->addSuccessor(EndMBB);
}
@@ -28371,13 +29478,11 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
// Create the conditional branch instructions.
X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
- unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
- BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
+ BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
X86::CondCode SecondCC =
X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
- unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
- BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
+ BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
@@ -28463,20 +29568,21 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineInstr *LastCMOV = &MI;
- MachineBasicBlock::iterator NextMIIt =
- std::next(MachineBasicBlock::iterator(MI));
+ MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
// Check for case 1, where there are multiple CMOVs with the same condition
// first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
// number of jumps the most.
if (isCMOVPseudo(MI)) {
- // See if we have a string of CMOVS with the same condition.
+ // See if we have a string of CMOVS with the same condition. Skip over
+ // intervening debug insts.
while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
++NextMIIt;
+ NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
}
}
@@ -28508,8 +29614,18 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
SinkMBB->addLiveIn(X86::EFLAGS);
}
+ // Transfer any debug instructions inside the CMOV sequence to the sunk block.
+ auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
+ auto DbgIt = MachineBasicBlock::iterator(MI);
+ while (DbgIt != DbgEnd) {
+ auto Next = std::next(DbgIt);
+ if (DbgIt->isDebugInstr())
+ SinkMBB->push_back(DbgIt->removeFromParent());
+ DbgIt = Next;
+ }
+
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
- SinkMBB->splice(SinkMBB->begin(), ThisMBB,
+ SinkMBB->splice(SinkMBB->end(), ThisMBB,
std::next(MachineBasicBlock::iterator(LastCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
@@ -28522,8 +29638,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
FalseMBB->addSuccessor(SinkMBB);
// Create the conditional branch instruction.
- unsigned Opc = X86::GetCondBranchFromCond(CC);
- BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
+ BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
// SinkMBB:
// %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
@@ -28540,53 +29655,6 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
- MachineBasicBlock *BB) const {
- // Combine the following atomic floating-point modification pattern:
- // a.store(reg OP a.load(acquire), release)
- // Transform them into:
- // OPss (%gpr), %xmm
- // movss %xmm, (%gpr)
- // Or sd equivalent for 64-bit operations.
- unsigned MOp, FOp;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
- case X86::RELEASE_FADD32mr:
- FOp = X86::ADDSSrm;
- MOp = X86::MOVSSmr;
- break;
- case X86::RELEASE_FADD64mr:
- FOp = X86::ADDSDrm;
- MOp = X86::MOVSDmr;
- break;
- }
- const X86InstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- unsigned ValOpIdx = X86::AddrNumOperands;
- unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
- MachineInstrBuilder MIB =
- BuildMI(*BB, MI, DL, TII->get(FOp),
- MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
- .addReg(VSrc);
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand &Operand = MI.getOperand(i);
- // Clear any kill flags on register operands as we'll create a second
- // instruction using the same address operands.
- if (Operand.isReg())
- Operand.setIsKill(false);
- MIB.add(Operand);
- }
- MachineInstr *FOpMI = MIB;
- MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(MI.getOperand(i));
- MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
- MI.eraseFromParent(); // The pseudo instruction is gone now.
- return BB;
-}
-
-MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
@@ -28652,7 +29720,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
- BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
+ BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
@@ -29279,7 +30347,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
.addReg(SSPCopyReg)
.addReg(SSPCopyReg);
- BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
+ BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
checkSspMBB->addSuccessor(sinkMBB);
checkSspMBB->addSuccessor(fallMBB);
@@ -29309,7 +30377,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
.addReg(SSPCopyReg);
// Jump to sink in case PrevSSPReg <= SSPCopyReg.
- BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
+ BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
fallMBB->addSuccessor(sinkMBB);
fallMBB->addSuccessor(fixShadowMBB);
@@ -29332,7 +30400,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
.addImm(8);
// Jump if the result of the shift is zero.
- BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
+ BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
fixShadowMBB->addSuccessor(sinkMBB);
fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
@@ -29367,7 +30435,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
// Jump if the counter is not zero yet.
- BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
+ BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
fixShadowLoopMBB->addSuccessor(sinkMBB);
fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
@@ -29512,10 +30580,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
- MachineFrameInfo &MFI = MF->getFrameInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
- int FI = MFI.getFunctionContextIndex();
+ int FI = MF->getFrameInfo().getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
@@ -29613,7 +30680,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
.addReg(IReg)
.addImm(LPadList.size());
- BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
+ BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
if (Subtarget.is64Bit()) {
unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
@@ -29766,7 +30833,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
+ case X86::CMOV_FR32X:
case X86::CMOV_FR64:
+ case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
@@ -29821,10 +30890,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}
- case X86::RELEASE_FADD32mr:
- case X86::RELEASE_FADD64mr:
- return EmitLoweredAtomicFP(MI, BB);
-
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
@@ -29836,27 +30901,37 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
- int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL,
- TII->get(X86::FNSTCW16m)), CWFrameIdx);
+ TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
- // Load the old value of the high byte of the control word...
+ // Load the old value of the control word...
unsigned OldCW =
+ MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
+ OrigCWFrameIdx);
+
+ // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
+ unsigned NewCW =
+ MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
+ .addReg(OldCW, RegState::Kill).addImm(0xC00);
+
+ // Extract to 16 bits.
+ unsigned NewCW16 =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
- addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
- CWFrameIdx);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
+ .addReg(NewCW, RegState::Kill, X86::sub_16bit);
- // Set the high part to be round to zero...
- addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
- .addImm(0xC7F);
+ // Prepare memory for FLDCW.
+ int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
+ NewCWFrameIdx)
+ .addReg(NewCW16, RegState::Kill);
// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL,
- TII->get(X86::FLDCW16m)), CWFrameIdx);
-
- // Restore the memory image of control word to original value
- addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
- .addReg(OldCW);
+ TII->get(X86::FLDCW16m)), NewCWFrameIdx);
// Get the X86 opcode to use.
unsigned Opc;
@@ -29879,26 +30954,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
- TII->get(X86::FLDCW16m)), CWFrameIdx);
+ TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
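For reference, the control-word edit above relies on bits 10-11 of the x87 FPCW being the rounding-control field, with 0b11 meaning round toward zero (truncate). A minimal scalar restatement:

#include <cstdint>

// ORing 0xC00 into the control word forces truncation while leaving the
// precision-control and exception-mask bits untouched.
uint16_t forceRoundTowardZero(uint16_t FPCW) {
  return static_cast<uint16_t>(FPCW | 0xC00);
}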
- // Thread synchronization.
- case X86::MONITOR:
- return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
- case X86::MONITORX:
- return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
-
- // Cache line zero
- case X86::CLZERO:
- return emitClzero(&MI, BB, Subtarget);
-
- // PKU feature
- case X86::WRPKRU:
- return emitWRPKRU(MI, BB, Subtarget);
- case X86::RDPKRU:
- return emitRDPKRU(MI, BB, Subtarget);
+
// xbegin
case X86::XBEGIN:
return emitXBegin(MI, BB, Subtarget.getInstrInfo());
@@ -30093,7 +31154,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Op.getConstantOperandVal(1));
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
- Known = Known.zextOrTrunc(BitWidth);
+ Known = Known.zextOrTrunc(BitWidth, false);
Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
break;
}
@@ -30150,6 +31211,27 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = Known.trunc(BitWidth);
break;
}
+ case X86ISD::ANDNP: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // ANDNP = (~X & Y);
+ Known.One &= Known2.Zero;
+ Known.Zero |= Known2.One;
+ break;
+ }
+ case X86ISD::FOR: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ Known.Zero &= Known2.Zero;
+ // Output known-1 bits are known to be set if set in either the LHS | RHS.
+ Known.One |= Known2.One;
+ break;
+ }
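A compact restatement of the ANDNP transfer function just added, with the KnownBits bookkeeping reduced to two plain masks (the struct below is a simplification for illustration, not LLVM's KnownBits class): for ~X & Y, a bit is known one only where X is known zero and Y is known one, and known zero wherever X is known one or Y is known zero.

#include <cstdint>

struct KnownBitsLite { uint64_t Zero, One; };

// Operand order matches the code above: X is operand 0, Y is operand 1.
KnownBitsLite knownAndNot(KnownBitsLite X, KnownBitsLite Y) {
  return { Y.Zero | X.One,   // result bit zero where Y is zero or X is one
           Y.One & X.Zero }; // result bit one where Y is one and X is zero
}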
case X86ISD::CMOV: {
Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
// If we don't know any bits, early out.
@@ -30219,7 +31301,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
- unsigned VTBits = Op.getScalarValueSizeInBits();
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case X86ISD::SETCC_CARRY:
@@ -30257,7 +31340,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
case X86ISD::VSHLI: {
SDValue Src = Op.getOperand(0);
- APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits))
return VTBits; // Shifted all bits out --> zero.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
@@ -30268,7 +31351,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
- APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ APInt ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits - 1))
return VTBits; // Sign splat.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
@@ -30284,6 +31367,15 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
// Vector compares return zero/all-bits result values.
return VTBits;
+ case X86ISD::ANDNP: {
+ unsigned Tmp0 =
+ DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp0 == 1) return 1; // Early out.
+ unsigned Tmp1 =
+ DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ return std::min(Tmp0, Tmp1);
+ }
+
case X86ISD::CMOV: {
unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp0 == 1) return 1; // Early out.
@@ -30292,6 +31384,54 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
}
}
+ // Handle target shuffles.
+ // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
+ if (isTargetShuffle(Opcode)) {
+ bool IsUnary;
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
+ IsUnary)) {
+ unsigned NumOps = Ops.size();
+ unsigned NumElts = VT.getVectorNumElements();
+ if (Mask.size() == NumElts) {
+ SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ // For UNDEF elements, we don't know anything about the common state
+ // of the shuffle result.
+ return 1;
+ } else if (M == SM_SentinelZero) {
+ // Zero = all sign bits.
+ continue;
+ }
+ assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
+ "Shuffle index out of range");
+
+ unsigned OpIdx = (unsigned)M / NumElts;
+ unsigned EltIdx = (unsigned)M % NumElts;
+ if (Ops[OpIdx].getValueType() != VT) {
+ // TODO - handle target shuffle ops with different value types.
+ return 1;
+ }
+ DemandedOps[OpIdx].setBit(EltIdx);
+ }
+ unsigned Tmp0 = VTBits;
+ for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
+ if (!DemandedOps[i])
+ continue;
+ unsigned Tmp1 =
+ DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
+ Tmp0 = std::min(Tmp0, Tmp1);
+ }
+ return Tmp0;
+ }
+ }
+ }
+
// Fallback case.
return 1;
}
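The shuffle handling above only consults the source elements that the demanded lanes actually read, then takes the minimum sign-bit count across those sources. A standalone sketch of that bookkeeping, with plain int16_t arrays standing in for the vector operands and a hand-rolled countSignBits rather than the DAG API, might look like:

    #include <algorithm>
    #include <array>
    #include <cassert>
    #include <cstdint>

    // Number of leading bits equal to the sign bit in a 16-bit value (>= 1).
    static unsigned countSignBits(int16_t V) {
      unsigned U = static_cast<uint16_t>(V);
      unsigned Sign = (U >> 15) & 1;
      unsigned N = 1;
      while (N < 16 && ((U >> (15 - N)) & 1) == Sign)
        ++N;
      return N;
    }

    int main() {
      // Two 4-element sources and a mask indexing into both (0..3 -> Op0,
      // 4..7 -> Op1). Lane 2 of the result is not demanded.
      std::array<int16_t, 4> Op0 = {-1, -2, -3, -4};
      std::array<int16_t, 4> Op1 = {30000, 1, 2, 3}; // Op1[0] has only 1 sign bit
      std::array<int, 4> Mask = {0, 1, 4, 2};
      std::array<bool, 4> Demanded = {true, true, false, true};

      unsigned MinBits = 16;
      for (int i = 0; i != 4; ++i) {
        if (!Demanded[i])
          continue; // skip lanes nobody reads
        int M = Mask[i];
        int16_t Elt = M < 4 ? Op0[M] : Op1[M - 4];
        MinBits = std::min(MinBits, countSignBits(Elt));
      }
      // The undemanded lane 2 is the only one that reads Op1[0]; skipping it
      // keeps the answer at 14 instead of collapsing to 1.
      assert(MinBits == 14);
      return 0;
    }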
@@ -30305,12 +31445,11 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
+static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget, unsigned &Shuffle,
+ MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
@@ -30322,19 +31461,25 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return true;
}
- // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
+ // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
- bool Match = true;
+ bool MatchAny = true;
+ bool MatchZero = true;
unsigned NumDstElts = NumMaskElts / Scale;
- for (unsigned i = 0; i != NumDstElts && Match; ++i) {
- Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
- Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
+ for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
+ if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
+ MatchAny = MatchZero = false;
+ break;
+ }
+ MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
+ MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
- if (Match) {
+ if (MatchAny || MatchZero) {
+ assert(MatchZero && "Failed to match zext but matched aext?");
unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
MVT::getIntegerVT(MaskEltSize);
@@ -30343,10 +31488,9 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
- if (SrcVT.getVectorNumElements() == NumDstElts)
- Shuffle = unsigned(ISD::ZERO_EXTEND);
- else
- Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
+ Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
+ if (SrcVT.getVectorNumElements() != NumDstElts)
+ Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
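To make the MatchAny/MatchZero split above concrete, here is a minimal standalone restatement of the same per-element test, with -1 standing in for SM_SentinelUndef and -2 for SM_SentinelZero (it only shows which masks are accepted as any-extends versus zero-extends; it is not the LLVM helper itself).

    #include <cassert>
    #include <utility>
    #include <vector>

    constexpr int Undef = -1; // ~ SM_SentinelUndef
    constexpr int Zero  = -2; // ~ SM_SentinelZero

    // Returns {MatchAny, MatchZero} for an extension by factor Scale.
    static std::pair<bool, bool> matchExtend(const std::vector<int> &Mask,
                                             unsigned Scale) {
      bool MatchAny = true, MatchZero = true;
      unsigned NumDstElts = Mask.size() / Scale;
      for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
        // Element i must come from source element i (or be undef).
        if (Mask[i * Scale] != Undef && Mask[i * Scale] != (int)i) {
          MatchAny = MatchZero = false;
          break;
        }
        // The Scale-1 trailing lanes decide any-extend vs zero-extend.
        for (unsigned j = 1; j != Scale; ++j) {
          int M = Mask[i * Scale + j];
          MatchAny = MatchAny && (M == Undef);
          MatchZero = MatchZero && (M == Undef || M == Zero);
        }
      }
      return {MatchAny, MatchZero};
    }

    int main() {
      // v4i32 -> v2i64 style masks (Scale == 2).
      assert(matchExtend({0, Zero, 1, Zero}, 2) == std::make_pair(false, true));
      assert(matchExtend({0, Undef, 1, Undef}, 2) == std::make_pair(true, true));
      assert(matchExtend({0, 3, 1, Zero}, 2) == std::make_pair(false, false));
      return 0;
    }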
@@ -30368,7 +31512,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// instructions are no slower than UNPCKLPD but has the option to
// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
- if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
@@ -30426,29 +31570,18 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- // Attempt to match against broadcast-from-vector.
- if (Subtarget.hasAVX2()) {
- SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
- if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
- SrcVT = DstVT = MaskVT;
- Shuffle = X86ISD::VBROADCAST;
- return true;
- }
- }
-
return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- const APInt &Zeroable,
- bool AllowFloatDomain,
- bool AllowIntDomain,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &ShuffleVT,
- unsigned &PermuteImm) {
+static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
@@ -30549,9 +31682,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// FIXME: Add 512-bit support.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
- MaskScalarSizeInBits, Mask,
- 0, Zeroable, Subtarget);
+ int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
+ Mask, 0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
return true;
@@ -30564,13 +31696,12 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, SDValue &V2, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
- bool IsUnary) {
+static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
+ bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
@@ -30631,7 +31762,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return false;
}
-static bool matchBinaryPermuteVectorShuffle(
+static bool matchBinaryPermuteShuffle(
MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
@@ -30642,7 +31773,7 @@ static bool matchBinaryPermuteVectorShuffle(
// Attempt to match against PALIGNR byte rotate.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
+ int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
@@ -30678,34 +31809,11 @@ static bool matchBinaryPermuteVectorShuffle(
return true;
}
} else {
- // Determine a type compatible with X86ISD::BLENDI.
- ShuffleVT = MaskVT;
- if (Subtarget.hasAVX2()) {
- if (ShuffleVT == MVT::v4i64)
- ShuffleVT = MVT::v8i32;
- else if (ShuffleVT == MVT::v2i64)
- ShuffleVT = MVT::v4i32;
- } else {
- if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
- ShuffleVT = MVT::v8i16;
- else if (ShuffleVT == MVT::v4i64)
- ShuffleVT = MVT::v4f64;
- else if (ShuffleVT == MVT::v8i32)
- ShuffleVT = MVT::v8f32;
- }
-
- if (!ShuffleVT.isFloatingPoint()) {
- int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
- BlendMask =
- scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
- ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
- ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
- }
-
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
+ ShuffleVT = MaskVT;
return true;
}
}
@@ -30715,7 +31823,7 @@ static bool matchBinaryPermuteVectorShuffle(
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
MaskVT.is128BitVector()) {
if (Zeroable.getBoolValue() &&
- matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
return true;
@@ -30727,7 +31835,7 @@ static bool matchBinaryPermuteVectorShuffle(
((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
- if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
+ if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
@@ -30784,6 +31892,11 @@ static bool matchBinaryPermuteVectorShuffle(
return false;
}
+static SDValue combineX86ShuffleChainWithExtract(
+ ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget);
+
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
@@ -30841,6 +31954,24 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
bool IsEVEXShuffle =
RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+ // Attempt to match a subvector broadcast.
+ // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
+ if (UnaryShuffle &&
+ (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
+ SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
+ if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
+ SDValue Src = Inputs[0];
+ if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(0).isUndef() &&
+ Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
+ MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
+ return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
+ Src.getValueType(),
+ Src.getOperand(1)));
+ }
+ }
+ }
+
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
// Handle 128-bit lane shuffles of 256-bit vectors.
@@ -30894,6 +32025,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
+ // TODO: Should we indicate which domain is preferred if both are allowed?
bool AllowFloatDomain = FloatDomain || (Depth > 3);
bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
@@ -30909,8 +32041,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// directly if we don't shuffle the lower element and we shuffle the upper
// (zero) elements within themselves.
if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
- (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
- unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
+ (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
+ MaskEltSizeInBits) == 0) {
+ unsigned Scale =
+ cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
+ MaskEltSizeInBits;
ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
@@ -30918,10 +32053,35 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ // Attempt to match against broadcast-from-vector.
+ // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
+ if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
+ && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
+ SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
+ if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
+ if (V1.getValueType() == MaskVT &&
+ V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ MayFoldLoad(V1.getOperand(0))) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+ return SDValue(); // Nothing to do!
+ Res = V1.getOperand(0);
+ Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ if (Subtarget.hasAVX2()) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+ return SDValue(); // Nothing to do!
+ Res = DAG.getBitcast(MaskVT, V1);
+ Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+ }
+
SDValue NewV1 = V1; // Save operand in case early exit happens.
- if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- NewV1, DL, DAG, Subtarget, Shuffle,
- ShuffleSrcVT, ShuffleVT) &&
+ if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
+ DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
@@ -30930,9 +32090,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
- if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
- AllowIntDomain, Subtarget, Shuffle,
- ShuffleVT, PermuteImm) &&
+ if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
+ PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
@@ -30945,9 +32105,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
- if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
- ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
+ if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
+ NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT, UnaryShuffle) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
@@ -30959,7 +32119,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
- if (matchBinaryPermuteVectorShuffle(
+ if (matchBinaryPermuteShuffle(
MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
@@ -30979,8 +32139,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Annoyingly, SSE4A instructions don't map into the above match helpers.
if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
uint64_t BitLen, BitIdx;
- if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
- Zeroable)) {
+ if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
+ Zeroable)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
@@ -30990,7 +32150,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
- if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
+ if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
@@ -31057,6 +32217,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
+ // If that failed and either input is extracted then try to combine as a
+ // shuffle with the larger type.
+ if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
+ Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
+ DAG, Subtarget))
+ return WideShuffle;
+
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
@@ -31222,10 +32389,145 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
+ // If that failed and either input is extracted then try to combine as a
+ // shuffle with the larger type.
+ if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
+ Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
+ DAG, Subtarget))
+ return WideShuffle;
+
+ // If we have a dual input shuffle then lower to VPERMV3.
+ if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasVLX() &&
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
+ MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() &&
+ (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
+ (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+ (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
+ (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+ V1 = DAG.getBitcast(MaskVT, V1);
+ V2 = DAG.getBitcast(MaskVT, V2);
+ Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
// Failed to find any combines.
return SDValue();
}
+// Combine an arbitrary chain of shuffles + extract_subvectors into a single
+// instruction if possible.
+//
+// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
+// type size to attempt to combine:
+// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
+// -->
+// extract_subvector(shuffle(x,y,m2),0)
+static SDValue combineX86ShuffleChainWithExtract(
+ ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned NumMaskElts = BaseMask.size();
+ unsigned NumInputs = Inputs.size();
+ if (NumInputs == 0)
+ return SDValue();
+
+ SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
+ SmallVector<unsigned, 4> Offsets(NumInputs, 0);
+
+ // Peek through subvectors.
+ // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
+ unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
+ for (unsigned i = 0; i != NumInputs; ++i) {
+ SDValue &Src = WideInputs[i];
+ unsigned &Offset = Offsets[i];
+ Src = peekThroughBitcasts(Src);
+ EVT BaseVT = Src.getValueType();
+ while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ isa<ConstantSDNode>(Src.getOperand(1))) {
+ Offset += Src.getConstantOperandVal(1);
+ Src = Src.getOperand(0);
+ }
+ WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits());
+ assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
+ "Unexpected subvector extraction");
+ Offset /= BaseVT.getVectorNumElements();
+ Offset *= NumMaskElts;
+ }
+
+ // Bail if we're always extracting from the lowest subvectors;
+ // combineX86ShuffleChain should match this for the current width.
+ if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
+ return SDValue();
+
+ EVT RootVT = Root.getValueType();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ unsigned Scale = WideSizeInBits / RootSizeInBits;
+ assert((WideSizeInBits % RootSizeInBits) == 0 &&
+ "Unexpected subvector extraction");
+
+ // If the src vector types aren't the same, see if we can extend
+ // them to match each other.
+ // TODO: Support different scalar types?
+ EVT WideSVT = WideInputs[0].getValueType().getScalarType();
+ if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
+ return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
+ Op.getValueType().getScalarType() != WideSVT;
+ }))
+ return SDValue();
+
+ for (SDValue &NewInput : WideInputs) {
+ assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
+ "Shuffle vector size mismatch");
+ if (WideSizeInBits > NewInput.getValueSizeInBits())
+ NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
+ SDLoc(NewInput), WideSizeInBits);
+ assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
+ "Unexpected subvector extraction");
+ }
+
+ // Create new mask for larger type.
+ for (unsigned i = 1; i != NumInputs; ++i)
+ Offsets[i] += i * Scale * NumMaskElts;
+
+ SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
+ for (int &M : WideMask) {
+ if (M < 0)
+ continue;
+ M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
+ }
+ WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
+
+ // Remove unused/repeated shuffle source ops.
+ resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
+ assert(!WideInputs.empty() && "Shuffle with no inputs detected");
+
+ if (WideInputs.size() > 2)
+ return SDValue();
+
+ // Increase depth for every upper subvector we've peeked through.
+ Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
+
+ // Attempt to combine wider chain.
+ // TODO: Can we use a better Root?
+ SDValue WideRoot = WideInputs[0];
+ if (SDValue WideShuffle = combineX86ShuffleChain(
+ WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
+ AllowVariableMask, DAG, Subtarget)) {
+ WideShuffle =
+ extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
+ return DAG.getBitcast(RootVT, WideShuffle);
+ }
+ return SDValue();
+}
+
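The mask rewrite in combineX86ShuffleChainWithExtract is pure index arithmetic, so it can be illustrated in isolation. The sketch below assumes a hypothetical setup of two v8i32 sources whose v4i32 subvectors feed a 4-element base mask (input 0 extracted at element 4, input 1 at element 0), and mirrors the (M % NumMaskElts) + Offsets[M / NumMaskElts] remapping plus the trailing undef padding.

    #include <cassert>
    #include <vector>

    int main() {
      const unsigned NumMaskElts = 4; // base mask width (root is 128-bit v4i32)
      const unsigned Scale = 2;       // wide size / root size (256 / 128)

      // Per-input offsets, already converted to units of NumMaskElts:
      // input 0 = extract_subvector(X, 4), input 1 = extract_subvector(Y, 0).
      std::vector<unsigned> Offsets = {4, 0};
      std::vector<int> Mask = {0, 5, 2, 7}; // 0..3 -> input 0, 4..7 -> input 1

      // Bias every input after the first by its slot in the wide operand list.
      for (unsigned i = 1; i != Offsets.size(); ++i)
        Offsets[i] += i * Scale * NumMaskElts;

      // Remap onto the wide inputs and pad the upper lanes with undef (-1).
      std::vector<int> WideMask(Mask);
      for (int &M : WideMask)
        if (M >= 0)
          M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
      WideMask.insert(WideMask.end(), (Scale - 1) * NumMaskElts, -1);

      // The original shuffle read X[4], Y[1], X[6], Y[3]; over the concatenated
      // wide pair {X, Y} that is exactly this mask.
      assert((WideMask == std::vector<int>{4, 9, 6, 11, -1, -1, -1, -1}));
      return 0;
    }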
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
@@ -31370,19 +32672,10 @@ static SDValue combineX86ShufflesRecursively(
if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
return SDValue();
- // TODO - Add support for more than 2 inputs.
- if (2 < OpInputs.size())
- return SDValue();
-
- SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
- SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
-
// Add the inputs to the Ops list, avoiding duplicates.
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
- if (!Input)
- return -1;
// Attempt to find an existing match.
SDValue InputBC = peekThroughBitcasts(Input);
for (int i = 0, e = Ops.size(); i < e; ++i)
@@ -31398,8 +32691,9 @@ static SDValue combineX86ShufflesRecursively(
return Ops.size() - 1;
};
- int InputIdx0 = AddOp(Input0, SrcOpIndex);
- int InputIdx1 = AddOp(Input1, -1);
+ SmallVector<int, 2> OpInputIdx;
+ for (SDValue OpInput : OpInputs)
+ OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
@@ -31471,13 +32765,9 @@ static SDValue combineX86ShufflesRecursively(
: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
- if (OpMask[OpIdx] < (int)OpMask.size()) {
- assert(0 <= InputIdx0 && "Unknown target shuffle input");
- OpMaskedIdx += InputIdx0 * MaskWidth;
- } else {
- assert(0 <= InputIdx1 && "Unknown target shuffle input");
- OpMaskedIdx += InputIdx1 * MaskWidth;
- }
+ int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
+ assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
+ OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
Mask[i] = OpMaskedIdx;
}
@@ -31493,7 +32783,7 @@ static SDValue combineX86ShufflesRecursively(
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
- // Remove unused shuffle source ops.
+ // Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
assert(!Ops.empty() && "Shuffle with no inputs detected");
@@ -31530,29 +32820,42 @@ static SDValue combineX86ShufflesRecursively(
return Cst;
// We can only combine unary and binary shuffle mask cases.
- if (Ops.size() > 2)
- return SDValue();
+ if (Ops.size() <= 2) {
+ // Minor canonicalization of the accumulated shuffle mask to make it easier
+ // to match below. All this does is detect masks with sequential pairs of
+ // elements, and shrink them to the half-width mask. It does this in a loop
+ // so it will reduce the size of the mask to the minimal width mask which
+ // performs an equivalent shuffle.
+ SmallVector<int, 64> WidenedMask;
+ while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+ Mask = std::move(WidenedMask);
+ }
+
+ // Canonicalization of binary shuffle masks to improve pattern matching by
+ // commuting the inputs.
+ if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Ops[0], Ops[1]);
+ }
- // Minor canonicalization of the accumulated shuffle mask to make it easier
- // to match below. All this does is detect masks with sequential pairs of
- // elements, and shrink them to the half-width mask. It does this in a loop
- // so it will reduce the size of the mask to the minimal width mask which
- // performs an equivalent shuffle.
- SmallVector<int, 64> WidenedMask;
- while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
- Mask = std::move(WidenedMask);
+ // Finally, try to combine into a single shuffle instruction.
+ return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
+ AllowVariableMask, DAG, Subtarget);
}
- // Canonicalization of binary shuffle masks to improve pattern matching by
- // commuting the inputs.
- if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
- ShuffleVectorSDNode::commuteMask(Mask);
- std::swap(Ops[0], Ops[1]);
- }
+ // If that failed and any input is extracted then try to combine as a
+ // shuffle with the larger type.
+ return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
+ HasVariableMask, AllowVariableMask,
+ DAG, Subtarget);
+}
- // Finally, try to combine into a single shuffle instruction.
- return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
- AllowVariableMask, DAG, Subtarget);
+/// Helper entry wrapper to combineX86ShufflesRecursively.
+static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false,
+ /*AllowVarMask*/ true, DAG, Subtarget);
}
/// Get the PSHUF-style mask from PSHUF node.
@@ -31770,12 +33073,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
switch (Opcode) {
case X86ISD::VBROADCAST: {
- // If broadcasting from another shuffle, attempt to simplify it.
- // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
EVT SrcVT = Src.getValueType();
EVT BCVT = BC.getValueType();
+
+ // If broadcasting from another shuffle, attempt to simplify it.
+ // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
if (isTargetShuffle(BC.getOpcode()) &&
VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
@@ -31789,6 +33093,71 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
}
+
+ // broadcast(bitcast(src)) -> bitcast(broadcast(src))
+ // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
+ if (Src.getOpcode() == ISD::BITCAST &&
+ SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
+ VT.getVectorNumElements());
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
+ }
+
+ // Reduce broadcast source vector to lowest 128-bits.
+ if (SrcVT.getSizeInBits() > 128)
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ extract128BitVector(Src, 0, DAG, DL));
+
+ // broadcast(scalar_to_vector(x)) -> broadcast(x).
+ if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
+
+ // Share broadcast with the longest vector and extract low subvector (free).
+ for (SDNode *User : Src->uses())
+ if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
+ User->getValueSizeInBits(0) > VT.getSizeInBits()) {
+ return extractSubVector(SDValue(User, 0), 0, DAG, DL,
+ VT.getSizeInBits());
+ }
+
+ return SDValue();
+ }
+ case X86ISD::BLENDI: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+
+ // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
+ // TODO: Handle MVT::v16i16 repeated blend mask.
+ if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
+ MVT SrcVT = N0.getOperand(0).getSimpleValueType();
+ if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
+ SrcVT.getScalarSizeInBits() >= 32) {
+ unsigned Mask = N.getConstantOperandVal(2);
+ unsigned Size = VT.getVectorNumElements();
+ unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+ unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
+ N1.getOperand(0),
+ DAG.getConstant(ScaleMask, DL, MVT::i8)));
+ }
+ }
+ return SDValue();
+ }
+ case X86ISD::VPERMI: {
+ // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
+ // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ if (N0.getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
+ return DAG.getBitcast(VT, Res);
+ }
return SDValue();
}
case X86ISD::PSHUFD:
@@ -32212,8 +33581,22 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
/// Eliminate a redundant shuffle of a horizontal math op.
static SDValue foldShuffleOfHorizOp(SDNode *N) {
- if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
- return SDValue();
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
+ if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+ return SDValue();
+
+ // For a broadcast, peek through an extract element of index 0 to find the
+ // horizontal op: broadcast (ext_vec_elt HOp, 0)
+ EVT VT = N->getValueType(0);
+ if (Opcode == X86ISD::VBROADCAST) {
+ SDValue SrcOp = N->getOperand(0);
+ if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ SrcOp.getValueType() == MVT::f64 &&
+ SrcOp.getOperand(0).getValueType() == VT &&
+ isNullConstant(SrcOp.getOperand(1)))
+ N = SrcOp.getNode();
+ }
SDValue HOp = N->getOperand(0);
if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
@@ -32224,13 +33607,25 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
// ...similarly for v2f64 and v8i16.
- // TODO: Handle UNDEF operands.
- if (HOp.getOperand(0) != HOp.getOperand(1))
+ if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
+ HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
// When the operands of a horizontal math op are identical, the low half of
- // the result is the same as the high half. If the shuffle is also replicating
- // low and high halves, we don't need the shuffle.
+ // the result is the same as the high half. If a target shuffle is also
+ // replicating low and high halves, we don't need the shuffle.
+ if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
+ if (HOp.getScalarValueSizeInBits() == 64) {
+ // movddup (hadd X, X) --> hadd X, X
+ // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
+ assert((HOp.getValueType() == MVT::v2f64 ||
+ HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT &&
+ "Unexpected type for h-op");
+ return HOp;
+ }
+ return SDValue();
+ }
+
// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
@@ -32252,14 +33647,51 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
return SDValue();
}
+/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
+/// low half of each source vector and does not set any high half elements in
+/// the destination vector, narrow the shuffle to half its original size.
+static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
+ if (!Shuf->getValueType(0).isSimple())
+ return SDValue();
+ MVT VT = Shuf->getSimpleValueType(0);
+ if (!VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ // See if we can ignore all of the high elements of the shuffle.
+ ArrayRef<int> Mask = Shuf->getMask();
+ if (!isUndefUpperHalf(Mask))
+ return SDValue();
+
+ // Check if the shuffle mask accesses only the low half of each input vector
+ // (half-index output is 0 or 2).
+ int HalfIdx1, HalfIdx2;
+ SmallVector<int, 8> HalfMask(Mask.size() / 2);
+ if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
+ (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
+ return SDValue();
+
+ // Create a half-width shuffle to replace the unnecessarily wide shuffle.
+ // The trick is knowing that all of the insert/extract are actually free
+ // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
+ // of narrow inputs into a narrow output, and that is always cheaper than
+ // the wide shuffle that we started with.
+ return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
+ Shuf->getOperand(1), HalfMask, HalfIdx1,
+ HalfIdx2, false, DAG);
+}
+
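As a concrete example of the condition narrowShuffle tests for: a v8i32 shuffle mask such as <0, 9, 2, 11, u, u, u, u> writes only the low half of the result and reads only the low half of each v8i32 input, so it can be replaced by a v4i32 shuffle <0, 5, 2, 7> of the low xmm subregisters. The standalone check below is a rough restatement of that mask test (the same idea, not getHalfShuffleMask itself):

    #include <cassert>
    #include <vector>

    // Returns true (and fills HalfMask) if Mask writes only the low half of the
    // result and reads only the low half of each 2*Half-wide input.
    static bool canNarrow(const std::vector<int> &Mask, unsigned Half,
                          std::vector<int> &HalfMask) {
      // Upper half of the output must be undef.
      for (unsigned i = Half; i != Mask.size(); ++i)
        if (Mask[i] >= 0)
          return false;
      // Low-half lanes may only reference the low half of input 0 ([0, Half)) or
      // the low half of input 1 ([2*Half, 3*Half)).
      HalfMask.clear();
      for (unsigned i = 0; i != Half; ++i) {
        int M = Mask[i];
        if (M < 0) {
          HalfMask.push_back(-1);
        } else if ((unsigned)M < Half) {
          HalfMask.push_back(M);            // low half of input 0
        } else if ((unsigned)M >= 2 * Half && (unsigned)M < 3 * Half) {
          HalfMask.push_back(M - Half);     // low half of input 1
        } else {
          return false;                     // touches a high half
        }
      }
      return true;
    }

    int main() {
      std::vector<int> Half;
      assert(canNarrow({0, 9, 2, 11, -1, -1, -1, -1}, 4, Half));
      assert((Half == std::vector<int>{0, 5, 2, 7}));
      assert(!canNarrow({0, 12, 2, 11, -1, -1, -1, -1}, 4, Half)); // reads a high half
      return 0;
    }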
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
+ if (SDValue V = narrowShuffle(Shuf, DAG))
+ return V;
+
+ // If we have legalized the vector types, look for blends of FADD and FSUB
+ // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If we have legalized the vector types, look for blends of FADD and FSUB
- // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
@@ -32328,23 +33760,9 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
}
}
- // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
- // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
- // consecutive, non-overlapping, and in the right order.
- SmallVector<SDValue, 16> Elts;
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
- if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
- Elts.push_back(Elt);
- continue;
- }
- Elts.clear();
- break;
- }
-
- if (Elts.size() == VT.getVectorNumElements())
- if (SDValue LD =
- EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
- return LD;
+ // Attempt to combine into a vector load/broadcast.
+ if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
+ return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
@@ -32365,9 +33783,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// specific PSHUF instruction sequences into their minimal form so that we
// can evaluate how many specialized shuffle instructions are involved in
// a particular chain.
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
// Simplify source operands based on shuffle mask.
@@ -32378,6 +33794,68 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue(N, 0);
}
+ // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
+ // in the upper 64 bits.
+ // TODO: Can we generalize this using computeKnownBits.
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
+ (VT == MVT::v2f64 || VT == MVT::v2i64) &&
+ N->getOperand(0).getOpcode() == ISD::BITCAST &&
+ (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
+ N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
+ SDValue In = N->getOperand(0).getOperand(0);
+ switch (In.getOpcode()) {
+ default:
+ break;
+ case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
+ case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
+ case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
+ case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
+ case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
+ case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
+ case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
+ if (In.getOperand(0).getValueType() == MVT::v2f64 ||
+ In.getOperand(0).getValueType() == MVT::v2i64)
+ return N->getOperand(0); // return the bitcast
+ break;
+ }
+ }
+
+ // Pull subvector inserts into undef through VZEXT_MOVL by making it an
+ // insert into a zero vector. This helps get VZEXT_MOVL closer to
+ // scalar_to_vectors where 256/512 are canonicalized to an insert and a
+ // 128-bit scalar_to_vector. This reduces the number of isel patterns.
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
+ N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
+ N->getOperand(0).hasOneUse() &&
+ N->getOperand(0).getOperand(0).isUndef() &&
+ isNullConstant(N->getOperand(0).getOperand(2))) {
+ SDValue In = N->getOperand(0).getOperand(1);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
+ getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
+ Movl, N->getOperand(0).getOperand(2));
+ }
+
+ // If this a vzmovl of a full vector load, replace it with a vzload, unless
+ // the load is volatile.
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
+ ISD::isNormalLoad(N->getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ if (!LN->isVolatile()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ VT.getVectorElementType(),
+ LN->getPointerInfo(),
+ LN->getAlignment(),
+ MachineMemOperand::MOLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ return VZLoad;
+ }
+ }
+
// Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
// operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
// FIXME: This can probably go away once we default to widening legalization.
@@ -32436,6 +33914,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Handle special case opcodes.
switch (Opc) {
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ: {
+ APInt LHSUndef, LHSZero;
+ APInt RHSUndef, RHSZero;
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
+ Depth + 1))
+ return true;
+ // Multiply by zero.
+ KnownZero = LHSZero | RHSZero;
+ break;
+ }
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA: {
@@ -32443,11 +33937,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Amt = Op.getOperand(1);
MVT AmtVT = Amt.getSimpleValueType();
assert(AmtVT.is128BitVector() && "Unexpected value type");
+
+ // If every use of the shift amount is another SSE shift node that also uses
+ // it only as the amount operand, then only the bottom 64 bits are ever read,
+ // so we can simplify it as if it had a single use.
+ bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
+ unsigned UseOpc = Use->getOpcode();
+ return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
+ UseOpc == X86ISD::VSRA) &&
+ Use->getOperand(0) != Amt;
+ });
+
APInt AmtUndef, AmtZero;
unsigned NumAmtElts = AmtVT.getVectorNumElements();
APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
- Depth + 1))
+ Depth + 1, AssumeSingleUse))
return true;
LLVM_FALLTHROUGH;
}
@@ -32487,6 +33991,58 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: {
+ APInt DemandedLHS, DemandedRHS;
+ getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+
+ APInt LHSUndef, LHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
+ LHSZero, TLO, Depth + 1))
+ return true;
+ APInt RHSUndef, RHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
+ RHSZero, TLO, Depth + 1))
+ return true;
+ break;
+ }
+ case X86ISD::VTRUNC:
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ KnownZero = SrcZero.zextOrTrunc(NumElts);
+ KnownUndef = SrcUndef.zextOrTrunc(NumElts);
+ break;
+ }
+ case X86ISD::BLENDV: {
+ APInt SelUndef, SelZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
+ SelZero, TLO, Depth + 1))
+ return true;
+
+ // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
+ APInt LHSUndef, LHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
+ LHSZero, TLO, Depth + 1))
+ return true;
+
+ APInt RHSUndef, RHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
+ RHSZero, TLO, Depth + 1))
+ return true;
+
+ KnownZero = LHSZero & RHSZero;
+ KnownUndef = LHSUndef & RHSUndef;
+ break;
+ }
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -32494,7 +34050,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return false;
// Don't bother broadcasting if we just need the 0'th element.
if (DemandedElts == 1) {
- if(Src.getValueType() != VT)
+ if (Src.getValueType() != VT)
Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
SDLoc(Op));
return TLO.CombineTo(Op, Src);
@@ -32506,8 +34062,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
- case X86ISD::PSHUFB: {
- // TODO - simplify other variable shuffle masks.
+ case X86ISD::SUBV_BROADCAST: {
+ // Reduce size of broadcast if we don't need the upper half.
+ unsigned HalfElts = NumElts / 2;
+ if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ SDValue Half = Src;
+ if (SrcVT.getVectorNumElements() != HalfElts) {
+ MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts);
+ Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src);
+ }
+
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0,
+ TLO.DAG, SDLoc(Op),
+ Half.getValueSizeInBits()));
+ }
+ break;
+ }
+ case X86ISD::VPERMV: {
+ SDValue Mask = Op.getOperand(0);
+ APInt MaskUndef, MaskZero;
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
+ Depth + 1))
+ return true;
+ break;
+ }
+ case X86ISD::PSHUFB:
+ case X86ISD::VPERMV3:
+ case X86ISD::VPERMILPV: {
SDValue Mask = Op.getOperand(1);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
@@ -32515,6 +34099,106 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMIL2: {
+ SDValue Mask = Op.getOperand(2);
+ APInt MaskUndef, MaskZero;
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
+ Depth + 1))
+ return true;
+ break;
+ }
+ }
+
+ // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
+ // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
+ // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
+ if ((VT.is256BitVector() || VT.is512BitVector()) &&
+ DemandedElts.lshr(NumElts / 2) == 0) {
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned ExtSizeInBits = SizeInBits / 2;
+
+ // See if 512-bit ops only use the bottom 128-bits.
+ if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
+ ExtSizeInBits = SizeInBits / 4;
+
+ switch (Opc) {
+ // Zero upper elements.
+ case X86ISD::VZEXT_MOVL: {
+ SDLoc DL(Op);
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp =
+ TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ // Byte shifts by immediate.
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
+ // Shift by uniform.
+ case X86ISD::VSHL:
+ case X86ISD::VSRL:
+ case X86ISD::VSRA:
+ // Shift by immediate.
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI: {
+ SDLoc DL(Op);
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp =
+ TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ case X86ISD::VPERMI: {
+ // Simplify PERMPD/PERMQ to extract_subvector.
+ // TODO: This should be done in shuffle combining.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64) {
+ SmallVector<int, 4> Mask;
+ DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
+ if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
+ SDLoc DL(Op);
+ SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
+ break;
+ }
+ // Target Shuffles.
+ case X86ISD::PSHUFB:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ // Saturated Packs.
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ // Horizontal Ops.
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: {
+ SDLoc DL(Op);
+ MVT ExtVT = VT.getSimpleVT();
+ ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
+ ExtSizeInBits / ExtVT.getScalarSizeInBits());
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue Ext1 =
+ extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
}
// Simplify target shuffles.
@@ -32606,9 +34290,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue RHS = Op.getOperand(1);
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
- if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
+ TLO, Depth + 1))
return true;
- if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
+ TLO, Depth + 1))
return true;
break;
}
@@ -32727,6 +34413,97 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
break;
}
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW: {
+ SDValue Vec = Op.getOperand(0);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ MVT VecVT = Vec.getSimpleValueType();
+ unsigned NumVecElts = VecVT.getVectorNumElements();
+
+ if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
+ unsigned Idx = CIdx->getZExtValue();
+ unsigned VecBitWidth = VecVT.getScalarSizeInBits();
+
+ // If we demand no bits from the vector then we must have demanded
+ // bits from the implicit zext - simplify to zero.
+ APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
+ if (DemandedVecBits == 0)
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ APInt KnownUndef, KnownZero;
+ APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
+ if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownVec;
+ if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
+ Known = KnownVec.zext(BitWidth, true);
+ return false;
+ }
+ break;
+ }
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Scl = Op.getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
+ unsigned Idx = CIdx->getZExtValue();
+ if (!OriginalDemandedElts[Idx])
+ return TLO.CombineTo(Op, Vec);
+
+ KnownBits KnownVec;
+ APInt DemandedVecElts(OriginalDemandedElts);
+ DemandedVecElts.clearBit(Idx);
+ if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownScl;
+ unsigned NumSclBits = Scl.getScalarValueSizeInBits();
+ APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
+ if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
+ return true;
+
+ KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
+ Known.One = KnownVec.One & KnownScl.One;
+ Known.Zero = KnownVec.Zero & KnownScl.Zero;
+ return false;
+ }
+ break;
+ }
+ case X86ISD::PACKSS:
+ // PACKSS saturates to MIN/MAX integer values, so if we only want the
+ // sign bit then we can just ask for the source operands' sign bits.
+ // TODO - add known bits handling.
+ if (OriginalDemandedBits.isSignMask()) {
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
+
+ KnownBits KnownLHS, KnownRHS;
+ APInt SignMask = APInt::getSignMask(BitWidth * 2);
+ if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
+ KnownLHS, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
+ KnownRHS, TLO, Depth + 1))
+ return true;
+ }
+ // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
+ break;
+ case X86ISD::PCMPGT:
+ // icmp sgt(0, R) == ashr(R, BitWidth-1).
+ // If we only need the sign bit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ break;
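The PCMPGT fold above is the vector form of a simple scalar identity: a signed greater-than compare against zero produces an all-ones or all-zeros lane whose sign bit equals the input's sign bit, which is exactly what ashr(R, BitWidth - 1) produces. A scalar sanity check (plain int32_t, not SDValues; the right shift of a negative value here is the usual arithmetic shift, guaranteed from C++20):

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    int main() {
      for (int32_t R : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX}) {
        int32_t Cmp = (0 > R) ? -1 : 0; // one lane of pcmpgt(0, R)
        int32_t Ashr = R >> 31;         // ashr(R, BitWidth - 1)
        assert(Cmp == Ashr);
        // In particular the sign bit of the compare result is R's sign bit.
        assert((Cmp < 0) == (R < 0));
      }
      return 0;
    }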
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -32868,29 +34645,42 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EltNo);
}
+// Helper to peek through bitops/setcc to determine size of source vector.
+// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
+static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ return Src.getOperand(0).getValueSizeInBits() == Size;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
+ checkBitcastSrcVectorSize(Src.getOperand(1), Size);
+ }
+ return false;
+}
+
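The recursion in checkBitcastSrcVectorSize just walks AND/OR/XOR down to the SETCC leaves and requires every compared vector to have the given width. A toy standalone restatement (small hand-rolled tree nodes, not SDNodes) shows the shape of that walk:

    #include <cassert>

    // Toy stand-ins for the DAG nodes the helper walks: a SETCC leaf records the
    // width (in bits) of the vectors being compared; AND/OR/XOR just have two kids.
    struct Node {
      enum Kind { SetCC, And, Or, Xor } K;
      unsigned CmpSrcBits = 0;                // only meaningful for SetCC
      const Node *L = nullptr, *R = nullptr;  // only meaningful for bitops
    };

    static bool srcVectorSizeIs(const Node &N, unsigned Size) {
      switch (N.K) {
      case Node::SetCC:
        return N.CmpSrcBits == Size;
      case Node::And:
      case Node::Or:
      case Node::Xor:
        return srcVectorSizeIs(*N.L, Size) && srcVectorSizeIs(*N.R, Size);
      }
      return false;
    }

    int main() {
      // (setcc v4i64) and (setcc v4i64): both compares were 256-bit wide.
      Node A{Node::SetCC, 256};
      Node B{Node::SetCC, 256};
      Node AndAB{Node::And, 0, &A, &B};
      assert(srcVectorSizeIs(AndAB, 256));
      // Mixing in a 128-bit compare makes the check fail.
      Node C{Node::SetCC, 128};
      Node OrC{Node::Or, 0, &AndAB, &C};
      assert(!srcVectorSizeIs(OrC, 256));
      return 0;
    }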
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
-static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
+static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
+ const SDLoc &DL,
const X86Subtarget &Subtarget) {
- EVT VT = BitCast.getValueType();
- SDValue N0 = BitCast.getOperand(0);
- EVT VecVT = N0->getValueType(0);
-
- if (!VT.isScalarInteger() || !VecVT.isSimple())
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
// If the input is a truncate from v16i8 or v32i8 go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
- bool IsTruncated = N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
- (N0.getOperand(0).getValueType() == MVT::v16i8 ||
- N0.getOperand(0).getValueType() == MVT::v32i8 ||
- N0.getOperand(0).getValueType() == MVT::v64i8);
+ bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
+ (Src.getOperand(0).getValueType() == MVT::v16i8 ||
+ Src.getOperand(0).getValueType() == MVT::v32i8 ||
+ Src.getOperand(0).getValueType() == MVT::v64i8);
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
@@ -32908,7 +34698,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
MVT SExtVT;
- switch (VecVT.getSimpleVT().SimpleTy) {
+ switch (SrcVT.getSimpleVT().SimpleTy) {
default:
return SDValue();
case MVT::v2i1:
@@ -32918,10 +34708,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
- if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- N0->getOperand(0).getValueType().is256BitVector()) {
+ if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256))
SExtVT = MVT::v4i64;
- }
break;
case MVT::v8i1:
SExtVT = MVT::v8i16;
@@ -32930,9 +34718,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
- if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- (N0->getOperand(0).getValueType().is256BitVector() ||
- N0->getOperand(0).getValueType().is512BitVector())) {
+ // TODO: use checkBitcastSrcVectorSize
+ if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
+ (Src.getOperand(0).getValueType().is256BitVector() ||
+ Src.getOperand(0).getValueType().is512BitVector())) {
SExtVT = MVT::v8i32;
}
break;
@@ -32956,8 +34745,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
return SDValue();
};
- SDLoc DL(BitCast);
- SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, N0);
+ SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
if (SExtVT == MVT::v64i8) {
SDValue Lo, Hi;
@@ -32977,7 +34765,11 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
DAG.getUNDEF(MVT::v8i16));
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
- return DAG.getZExtOrTrunc(V, DL, VT);
+
+ EVT IntVT =
+ EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
+ V = DAG.getZExtOrTrunc(V, DL, IntVT);
+ return DAG.getBitcast(VT, V);
}
// Convert a vXi1 constant build vector to the same width scalar integer.
@@ -33054,12 +34846,10 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
+static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDLoc DL(N);
- unsigned NumElts = N.getNumOperands();
-
- auto *BV = cast<BuildVectorSDNode>(N);
+ SDLoc DL(BV);
+ unsigned NumElts = BV->getNumOperands();
SDValue Splat = BV->getSplatValue();
// Build MMX element from integer GPR or SSE float values.
@@ -33107,7 +34897,7 @@ static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
Ops.append(NumElts, Splat);
} else {
for (unsigned i = 0; i != NumElts; ++i)
- Ops.push_back(CreateMMXElement(N.getOperand(i)));
+ Ops.push_back(CreateMMXElement(BV->getOperand(i)));
}
// Use tree of PUNPCKLs to build up general MMX vector.
@@ -33141,14 +34931,14 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
if (DCI.isBeforeLegalize()) {
- if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
+ SDLoc dl(N);
+ if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
Subtarget.hasAVX512()) {
- SDLoc dl(N);
N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
N0 = DAG.getBitcast(MVT::v8i1, N0);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
@@ -33159,7 +34949,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// type, widen both sides to avoid a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
- SDLoc dl(N);
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
@@ -33213,7 +35002,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
SrcVT == MVT::v8i8))
- return createMMXBuildVector(N0, DAG, Subtarget);
+ return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
// Detect bitcasts between element or subvector extraction to x86mmx.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
@@ -33297,66 +35086,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Given a select, detect the following pattern:
-// 1: %2 = zext <N x i8> %0 to <N x i32>
-// 2: %3 = zext <N x i8> %1 to <N x i32>
-// 3: %4 = sub nsw <N x i32> %2, %3
-// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
-// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
-// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+// Given an ABS node, detect the following pattern:
+// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
// This is useful as it is the input into a SAD pattern.
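// e.g. (abs (sub (zext v16i8 %a to v16i32), (zext v16i8 %b to v16i32)))
// produces the per-element |a - b| values that a PSADBW reduction consumes.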
-static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
- SDValue &Op1) {
- // Check the condition of the select instruction is greater-than.
- SDValue SetCC = Select->getOperand(0);
- if (SetCC.getOpcode() != ISD::SETCC)
- return false;
- ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
- if (CC != ISD::SETGT && CC != ISD::SETLT)
- return false;
-
- SDValue SelectOp1 = Select->getOperand(1);
- SDValue SelectOp2 = Select->getOperand(2);
-
- // The following instructions assume SelectOp1 is the subtraction operand
- // and SelectOp2 is the negation operand.
- // In the case of SETLT this is the other way around.
- if (CC == ISD::SETLT)
- std::swap(SelectOp1, SelectOp2);
-
- // The second operand of the select should be the negation of the first
- // operand, which is implemented as 0 - SelectOp1.
- if (!(SelectOp2.getOpcode() == ISD::SUB &&
- ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
- SelectOp2.getOperand(1) == SelectOp1))
- return false;
-
- // The first operand of SetCC is the first operand of the select, which is the
- // difference between the two input vectors.
- if (SetCC.getOperand(0) != SelectOp1)
- return false;
-
- // In SetLT case, The second operand of the comparison can be either 1 or 0.
- APInt SplatVal;
- if ((CC == ISD::SETLT) &&
- !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
- SplatVal.isOneValue()) ||
- (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
+ SDValue AbsOp1 = Abs->getOperand(0);
+ if (AbsOp1.getOpcode() != ISD::SUB)
return false;
- // In SetGT case, The second operand of the comparison can be either -1 or 0.
- if ((CC == ISD::SETGT) &&
- !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
- ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
- return false;
-
- // The first operand of the select is the difference between the two input
- // vectors.
- if (SelectOp1.getOpcode() != ISD::SUB)
- return false;
-
- Op0 = SelectOp1.getOperand(0);
- Op1 = SelectOp1.getOperand(1);
+ Op0 = AbsOp1.getOperand(0);
+ Op1 = AbsOp1.getOperand(1);
// Check if the operands of the sub are zero-extended from vectors of i8.
if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
@@ -33476,23 +35215,25 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, DL));
}
-// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+// Attempt to replace an all_of/any_of/parity style horizontal reduction with
+// a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- // Bail without SSE2 or with AVX512VL (which uses predicate registers).
- if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
+ // Bail without SSE2.
+ if (!Subtarget.hasSSE2())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
unsigned BitWidth = ExtractVT.getSizeInBits();
if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
- ExtractVT != MVT::i8)
+ ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
return SDValue();
- // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+ // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
+ if (!Match && ExtractVT == MVT::i1)
+ Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
if (!Match)
return SDValue();
@@ -33501,53 +35242,104 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
if (Match.getScalarValueSizeInBits() != BitWidth)
return SDValue();
- // We require AVX2 for PMOVMSKB for v16i16/v32i8;
- unsigned MatchSizeInBits = Match.getValueSizeInBits();
- if (!(MatchSizeInBits == 128 ||
- (MatchSizeInBits == 256 &&
- ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
- return SDValue();
+ SDValue Movmsk;
+ SDLoc DL(Extract);
+ EVT MatchVT = Match.getValueType();
+ unsigned NumElts = MatchVT.getVectorNumElements();
- // Don't bother performing this for 2-element vectors.
- if (Match.getValueType().getVectorNumElements() <= 2)
- return SDValue();
+ if (ExtractVT == MVT::i1) {
+ // Special case for (pre-legalization) vXi1 reductions.
+ if (NumElts > 32)
+ return SDValue();
+ if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) {
+ // If this is a legal AVX512 predicate type then we can just bitcast.
+ EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ Movmsk = DAG.getBitcast(MovmskVT, Match);
+ } else {
+ // Use combineBitcastvxi1 to create the MOVMSK.
+ if (NumElts == 32 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
+ Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
+ NumElts = 16;
+ }
+ EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
+ }
+ if (!Movmsk)
+ return SDValue();
+ Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32);
+ } else {
+ // Bail with AVX512VL (which uses predicate registers).
+ if (Subtarget.hasVLX())
+ return SDValue();
- // Check that we are extracting a reduction of all sign bits.
- if (DAG.ComputeNumSignBits(Match) != BitWidth)
- return SDValue();
+ unsigned MatchSizeInBits = Match.getValueSizeInBits();
+ if (!(MatchSizeInBits == 128 ||
+ (MatchSizeInBits == 256 && Subtarget.hasAVX())))
+ return SDValue();
- // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
- MVT MaskVT;
- if (64 == BitWidth || 32 == BitWidth)
- MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
- MatchSizeInBits / BitWidth);
- else
- MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+ // Make sure this isn't a vector of 1 element. The perf win from using
+ // MOVMSK diminishes with fewer elements in the reduction, but it is
+ // generally better to get the comparison over to the GPRs as soon as
+ // possible to reduce the number of vector ops.
+ if (Match.getValueType().getVectorNumElements() < 2)
+ return SDValue();
+
+ // Check that we are extracting a reduction of all sign bits.
+ if (DAG.ComputeNumSignBits(Match) != BitWidth)
+ return SDValue();
+
+ if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
+ Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
+ MatchSizeInBits = Match.getValueSizeInBits();
+ }
+
+ // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+ MVT MaskSrcVT;
+ if (64 == BitWidth || 32 == BitWidth)
+ MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+ MatchSizeInBits / BitWidth);
+ else
+ MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+ SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
+ Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
+ NumElts = MaskSrcVT.getVectorNumElements();
+ }
+ assert(NumElts <= 32 && "Not expecting more than 32 elements");
- APInt CompareBits;
+ if (BinOp == ISD::XOR) {
+ // parity -> (AND (CTPOP(MOVMSK X)), 1)
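+ // e.g. MOVMSK = 0b1011 -> CTPOP = 3 -> 3 & 1 = 1 (an odd number of set lanes).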
+ SDValue Mask = DAG.getConstant(1, DL, MVT::i32);
+ SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk);
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask);
+ return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
+ }
+
+ SDValue CmpC;
ISD::CondCode CondCode;
if (BinOp == ISD::OR) {
// any_of -> MOVMSK != 0
- CompareBits = APInt::getNullValue(32);
+ CmpC = DAG.getConstant(0, DL, MVT::i32);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
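// e.g. with NumElts = 16 the all_of test is MOVMSK == 0xFFFF.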
- CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+ CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32);
CondCode = ISD::CondCode::SETEQ;
}
- // Perform the select as i32/i64 and then truncate to avoid partial register
- // stalls.
- unsigned ResWidth = std::max(BitWidth, 32u);
- EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
- SDLoc DL(Extract);
- SDValue Zero = DAG.getConstant(0, DL, ResVT);
- SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
- SDValue Res = DAG.getBitcast(MaskVT, Match);
- Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
- Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
- Ones, Zero, CondCode);
- return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+ // The setcc produces an i8 of 0/1, so extend that to the result width and
+ // negate to get the final 0/-1 mask value.
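+ // e.g. a true setcc gives zext 1, and 0 - 1 yields all-ones in ExtractVT.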
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetccVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+ SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
+ SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
+ SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
+ return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
}
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
@@ -33592,7 +35384,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// If there was a match, we want Root to be a select that is the root of an
// abs-diff pattern.
- if (!Root || (Root.getOpcode() != ISD::VSELECT))
+ if (!Root || Root.getOpcode() != ISD::ABS)
return SDValue();
// Check whether we have an abs-diff pattern feeding into the select.
@@ -33651,15 +35443,19 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
+ SDValue SrcBC = peekThroughBitcasts(Src);
+
// Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
- if (X86ISD::VBROADCAST == Src.getOpcode() &&
- Src.getOperand(0).getValueType() == VT)
- return Src.getOperand(0);
+ if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
+ SDValue SrcOp = SrcBC.getOperand(0);
+ if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
+ return DAG.getBitcast(VT, SrcOp);
+ }
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
- if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
+ if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
@@ -33704,7 +35500,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
: DAG.getConstant(0, dl, VT);
SDValue SrcOp = Ops[SrcIdx / Mask.size()];
- SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SrcIdx = SrcIdx % Mask.size();
// We can only extract other elements from 128-bit vectors and in certain
@@ -33714,6 +35509,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
assert(SrcSVT == VT && "Unexpected extraction type");
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
}
@@ -33723,6 +35519,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
"Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);
@@ -33731,6 +35528,155 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Extracting a scalar FP value from vector element 0 is free, so extract each
+/// operand first, then perform the math as a scalar op.
+static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
+ assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
+ SDValue Vec = ExtElt->getOperand(0);
+ SDValue Index = ExtElt->getOperand(1);
+ EVT VT = ExtElt->getValueType(0);
+ EVT VecVT = Vec.getValueType();
+
+ // TODO: If this is a unary/expensive/expand op, allow extraction from a
+ // non-zero element because the shuffle+scalar op will be cheaper?
+ if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
+ return SDValue();
+
+ // Vector FP compares don't fit the pattern of FP math ops (propagate, not
+ // extract, the condition code), so deal with those as a special-case.
+ if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
+ EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
+ if (OpVT != MVT::f32 && OpVT != MVT::f64)
+ return SDValue();
+
+ // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
+ SDLoc DL(ExtElt);
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
+ Vec.getOperand(0), Index);
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
+ Vec.getOperand(1), Index);
+ return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
+ }
+
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return SDValue();
+
+ // Vector FP selects don't fit the pattern of FP math ops (because the
+ // condition has a different type and we have to change the opcode), so deal
+ // with those here.
+ // FIXME: This is restricted to pre type legalization by ensuring the setcc
+ // has i1 elements. If we loosen this we need to convert vector bool to a
+ // scalar bool.
+ if (Vec.getOpcode() == ISD::VSELECT &&
+ Vec.getOperand(0).getOpcode() == ISD::SETCC &&
+ Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
+ Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
+ // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
+ SDLoc DL(ExtElt);
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+ Vec.getOperand(0).getValueType().getScalarType(),
+ Vec.getOperand(0), Index);
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ Vec.getOperand(1), Index);
+ SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ Vec.getOperand(2), Index);
+ return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
+ }
+
+ // TODO: This switch could include FNEG and the x86-specific FP logic ops
+ // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
+ // missed load folding and fma+fneg combining.
+ switch (Vec.getOpcode()) {
+ case ISD::FMA: // Begin 3 operands
+ case ISD::FMAD:
+ case ISD::FADD: // Begin 2 operands
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FCOPYSIGN:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
+ case X86ISD::FMAX:
+ case X86ISD::FMIN:
+ case ISD::FABS: // Begin 1 operand
+ case ISD::FSQRT:
+ case ISD::FRINT:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case X86ISD::FRCP:
+ case X86ISD::FRSQRT: {
+ // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
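+ // e.g. extractelt (fadd v4f32 X, Y), 0 becomes
+ // fadd (extractelt X, 0), (extractelt Y, 0), i.e. a single scalar FP op.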
+ SDLoc DL(ExtElt);
+ SmallVector<SDValue, 4> ExtOps;
+ for (SDValue Op : Vec->ops())
+ ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
+ return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
+ }
+ default:
+ return SDValue();
+ }
+ llvm_unreachable("All opcodes should return within switch");
+}
+
+/// Try to convert a vector reduction sequence composed of binops and shuffles
+/// into horizontal ops.
+static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ return SDValue();
+ SDValue Index = ExtElt->getOperand(1);
+ if (!isNullConstant(Index))
+ return SDValue();
+
+ // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
+ ISD::NodeType Opc;
+ SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
+ if (!Rdx)
+ return SDValue();
+
+ EVT VT = ExtElt->getValueType(0);
+ EVT VecVT = ExtElt->getOperand(0).getValueType();
+ if (VecVT.getScalarType() != VT)
+ return SDValue();
+
+ unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+ SDLoc DL(ExtElt);
+
+ // 256-bit horizontal instructions operate on 128-bit chunks rather than
+ // across the whole vector, so we need an extract + hop preliminary stage.
+ // This is the only step where the operands of the hop are not the same value.
+ // TODO: We could extend this to handle 512-bit or even longer vectors.
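+ // e.g. a v8i32 reduction first computes hadd(hi128, lo128) to form a v4i32,
+ // which the log2(NumElts) hadd loop below then reduces to a single sum.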
+ if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
+ ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
+ SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
+ VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
+ }
+ if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
+ !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
+ return SDValue();
+
+ // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
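+ // e.g. for v4i32 <a,b,c,d>: hadd(X,X) = <a+b,c+d,a+b,c+d>, and a second hadd
+ // leaves a+b+c+d in every lane, so element 0 holds the full reduction.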
+ assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
+ unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
+ for (unsigned i = 0; i != ReductionSteps; ++i)
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+}
+
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@@ -33741,23 +35687,48 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
+ SDValue InputVector = N->getOperand(0);
+ SDValue EltIdx = N->getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
+
+ EVT SrcVT = InputVector.getValueType();
+ EVT VT = N->getValueType(0);
+ SDLoc dl(InputVector);
+ bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
+
+ if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
+ return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
+
+ // Integer Constant Folding.
+ if (CIdx && VT.isInteger()) {
+ APInt UndefVecElts;
+ SmallVector<APInt, 16> EltBits;
+ unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
+ if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
+ EltBits, true, false)) {
+ uint64_t Idx = CIdx->getZExtValue();
+ if (UndefVecElts[Idx])
+ return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
+ return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
+ dl, VT);
+ }
+ }
+
// TODO - Remove this once we can handle the implicit zero-extension of
// X86ISD::PEXTRW/X86ISD::PEXTRB in:
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
// combineBasicSADPattern.
- if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ if (IsPextr) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(
+ SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
+ return SDValue(N, 0);
return SDValue();
+ }
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
- SDValue InputVector = N->getOperand(0);
- SDValue EltIdx = N->getOperand(1);
-
- EVT SrcVT = InputVector.getValueType();
- EVT VT = N->getValueType(0);
- SDLoc dl(InputVector);
-
// Detect mmx extraction of all bits as a i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
@@ -33778,16 +35749,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
- if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
- isa<ConstantSDNode>(EltIdx) &&
- isa<ConstantSDNode>(InputVector.getOperand(0))) {
- uint64_t ExtractedElt = N->getConstantOperandVal(1);
- auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
- const APInt &InputValue = InputC->getAPIntValue();
- uint64_t Res = InputValue[ExtractedElt];
- return DAG.getConstant(Res, dl, MVT::i1);
- }
-
// Check whether this extract is the root of a sum of absolute differences
// pattern. This has to be done here because we really want it to happen
// pre-legalization,
@@ -33802,6 +35763,45 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
+ if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = scalarizeExtEltFP(N, DAG))
+ return V;
+
+ // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
+ // and then testing the relevant element.
+ if (CIdx && SrcVT.getScalarType() == MVT::i1) {
+ SmallVector<SDNode *, 16> BoolExtracts;
+ auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
+ if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Use->getOperand(1)) &&
+ Use->getValueType(0) == MVT::i1) {
+ BoolExtracts.push_back(Use);
+ return true;
+ }
+ return false;
+ };
+ if (all_of(InputVector->uses(), IsBoolExtract) &&
+ BoolExtracts.size() > 1) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
+ if (SDValue BC =
+ combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
+ for (SDNode *Use : BoolExtracts) {
+ // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
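+ // e.g. extracting element 2 of a v8i1 uses Mask = 0b00000100, so the result
+ // is ((movmsk X) & 4) == 4.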
+ unsigned MaskIdx = Use->getConstantOperandVal(1);
+ APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
+ SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
+ SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
+ Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
+ DCI.CombineTo(Use, Res);
+ }
+ return SDValue(N, 0);
+ }
+ }
+ }
+
return SDValue();
}
@@ -33825,11 +35825,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
assert(CondVT.isVector() && "Vector select expects a vector selector!");
- bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
// Check if the first operand is all zeros and Cond type is vXi1.
// This situation only applies to avx512.
- if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
- CondVT.getVectorElementType() == MVT::i1) {
+ // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
+ // TODO: Can we assert that both operands are not zeros (because that should
+ // get simplified at node creation time)?
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+ if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
+ Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
@@ -33844,12 +35848,10 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
- bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
- bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
-
// Try to invert the condition if true value is not all 1s and false value is
- // not all 0s.
- if (!TValIsAllOnes && !FValIsAllZeros &&
+ // not all 0s. Only do this if the condition has one use.
+ bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+ if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
// Check if the selector will be produced by CMPP*/PCMP*.
Cond.getOpcode() == ISD::SETCC &&
// Check if SETCC has already been promoted.
@@ -33907,6 +35909,39 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// If both arms of a vector select are concatenated vectors, split the select,
+/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
+/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
+/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
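+/// e.g. a 256-bit vselect whose arms are concats of 128-bit halves becomes two
+/// 128-bit selects plus a concat, eliminating the wide blend entirely.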
+static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
+ return SDValue();
+
+ // TODO: Split 512-bit vectors too?
+ EVT VT = N->getValueType(0);
+ if (!VT.is256BitVector())
+ return SDValue();
+
+ // TODO: Split as long as any 2 of the 3 operands are concatenated?
+ SDValue Cond = N->getOperand(0);
+ SDValue TVal = N->getOperand(1);
+ SDValue FVal = N->getOperand(2);
+ SmallVector<SDValue, 4> CatOpsT, CatOpsF;
+ if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
+ !collectConcatOps(TVal.getNode(), CatOpsT) ||
+ !collectConcatOps(FVal.getNode(), CatOpsF))
+ return SDValue();
+
+ auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
+ makeBlend, /*CheckBWI*/ false);
+}
+
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
@@ -33973,7 +36008,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
/// If this is a *dynamic* select (non-constant condition) and we can match
/// this node with one of the variable blend instructions, restructure the
/// condition so that blends can use the high (sign) bit of each element.
-/// This function will also call SimplfiyDemandedBits on already created
+/// This function will also call SimplifyDemandedBits on already created
/// BLENDV to perform additional simplifications.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -34268,6 +36303,42 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
+ // AVX512 - Extend select with zero to merge with target shuffle.
+ // select(mask, extract_subvector(shuffle(x)), zero) -->
+ // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
+ // TODO - support non target shuffles as well.
+ if (Subtarget.hasAVX512() && CondVT.isVector() &&
+ CondVT.getVectorElementType() == MVT::i1) {
+ auto SelectableOp = [&TLI](SDValue Op) {
+ return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ isTargetShuffle(Op.getOperand(0).getOpcode()) &&
+ isNullConstant(Op.getOperand(1)) &&
+ TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
+ Op.hasOneUse() && Op.getOperand(0).hasOneUse();
+ };
+
+ bool SelectableLHS = SelectableOp(LHS);
+ bool SelectableRHS = SelectableOp(RHS);
+ bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
+ EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
+ : RHS.getOperand(0).getValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
+ LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
+ VT.getSizeInBits());
+ RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
+ VT.getSizeInBits());
+ Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
+ DAG.getUNDEF(SrcCondVT), Cond,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
+ return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
+ }
+ }
+
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
@@ -34338,14 +36409,16 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > C-1 ? x+-C : 0 --> subus x, C
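// e.g. x > 41 ? x + (-42) : 0 --> usubsat x, 42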
- // TODO: Handle build_vectors with undef elements.
auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
- return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
+ return (!Op && !Cond) ||
+ (Op && Cond &&
+ Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
};
if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
- ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) {
- OpRHS = DAG.getNode(ISD::SUB, DL, VT,
- DAG.getConstant(0, DL, VT), OpRHS);
+ ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
+ /*AllowUndefs*/ true)) {
+ OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ OpRHS);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
@@ -34432,6 +36505,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
return V;
+ if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
+ return V;
+
// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
LHS = DAG.getBitcast(MVT::i64, LHS);
@@ -34715,7 +36791,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
// When legalizing carry, we create carries via add X, -1
// If that comes from an actual carry, via setcc, we use the
// carry directly.
-static SDValue combineCarryThroughADD(SDValue EFLAGS) {
+static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
if (EFLAGS.getOpcode() == X86ISD::ADD) {
if (isAllOnesConstant(EFLAGS.getOperand(1))) {
SDValue Carry = EFLAGS.getOperand(0);
@@ -34728,8 +36804,34 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS) {
Carry = Carry.getOperand(0);
if (Carry.getOpcode() == X86ISD::SETCC ||
Carry.getOpcode() == X86ISD::SETCC_CARRY) {
- if (Carry.getConstantOperandVal(0) == X86::COND_B)
- return Carry.getOperand(1);
+ // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
+ uint64_t CarryCC = Carry.getConstantOperandVal(0);
+ SDValue CarryOp1 = Carry.getOperand(1);
+ if (CarryCC == X86::COND_B)
+ return CarryOp1;
+ if (CarryCC == X86::COND_A) {
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because the Cmp
+ // instruction cannot take an immediate as its first operand.
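+ // e.g. "a u> b" tested as COND_A of SUB(a, b) becomes COND_B of SUB(b, a),
+ // which the carry flag encodes directly and "setb reg" can materialize.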
+ //
+ if (CarryOp1.getOpcode() == X86ISD::SUB &&
+ CarryOp1.getNode()->hasOneUse() &&
+ CarryOp1.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
+ SDValue SubCommute =
+ DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
+ CarryOp1.getOperand(1), CarryOp1.getOperand(0));
+ return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
+ }
+ }
+ // If this is a check of the z flag of an add with 1, switch to the
+ // C flag.
+ if (CarryCC == X86::COND_E &&
+ CarryOp1.getOpcode() == X86ISD::ADD &&
+ isOneConstant(CarryOp1.getOperand(1)))
+ return CarryOp1;
}
}
}
@@ -34744,7 +36846,7 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (CC == X86::COND_B)
- if (SDValue Flags = combineCarryThroughADD(EFLAGS))
+ if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
return Flags;
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
@@ -34763,6 +36865,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
+ // cmov X, X, ?, ? --> X
+ if (TrueOp == FalseOp)
+ return TrueOp;
+
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
@@ -35044,7 +37150,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
- bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
+ bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
@@ -35283,8 +37389,8 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
// Use SplitOpsAndApply to handle AVX splitting.
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
{ DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
@@ -35352,7 +37458,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
- if (DAG.getMachineFunction().getFunction().optForMinSize())
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
@@ -35489,7 +37595,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ APInt Mask = N0.getConstantOperandAPInt(1);
Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
@@ -35638,24 +37744,6 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- if (N->getOpcode() == ISD::SHL)
- if (SDValue V = combineShiftLeft(N, DAG))
- return V;
-
- if (N->getOpcode() == ISD::SRA)
- if (SDValue V = combineShiftRightArithmetic(N, DAG))
- return V;
-
- if (N->getOpcode() == ISD::SRL)
- if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
- return V;
-
- return SDValue();
-}
-
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -35677,8 +37765,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Constant Folding.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
- if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
- (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
+ if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
+ (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
@@ -35750,10 +37838,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Attempt to combine as shuffle.
SDValue Op(N, 0);
- if (SDValue Res =
- combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false,
- /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
@@ -35766,11 +37851,22 @@ static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
X86ISD::VSRL == N->getOpcode()) &&
"Unexpected shift opcode");
EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
// Shift zero -> zero.
- if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
+ // Detect constant shift amounts.
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
+ unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
+ return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
+ EltBits[0].getZExtValue(), DAG);
+ }
+
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
@@ -35829,9 +37925,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
// We can decode 'whole byte' logical bit shifts as shuffles.
if (LogicalShift && (ShiftVal % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -35864,18 +37958,20 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- assert(
- ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
- (N->getOpcode() == X86ISD::PINSRW &&
- N->getValueType(0) == MVT::v8i16)) &&
- "Unexpected vector insertion");
+ EVT VT = N->getValueType(0);
+ assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
+ (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
+ "Unexpected vector insertion");
+
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ return SDValue(N, 0);
// Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
- if (SDValue Res =
- combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false,
- /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
@@ -35894,8 +37990,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- SDValue CMP0 = N0->getOperand(1);
- SDValue CMP1 = N1->getOperand(1);
+ SDValue CMP0 = N0.getOperand(1);
+ SDValue CMP1 = N1.getOperand(1);
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
@@ -35987,6 +38083,34 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Match (xor X, -1) -> X.
+// Match extract_subvector(xor X, -1) -> extract_subvector(X).
+// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
+ V = peekThroughBitcasts(V);
+ if (V.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
+ return V.getOperand(0);
+ if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
+ if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
+ Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
+ Not, V.getOperand(1));
+ }
+ }
+ SmallVector<SDValue, 2> CatOps;
+ if (collectConcatOps(V.getNode(), CatOps)) {
+ for (SDValue &CatOp : CatOps) {
+ SDValue NotCat = IsNOT(CatOp, DAG);
+ if (!NotCat) return SDValue();
+ CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
+ }
+ return SDValue();
+}
+
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
@@ -35996,15 +38120,14 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
return SDValue();
SDValue X, Y;
- SDValue N0 = peekThroughBitcasts(N->getOperand(0));
- SDValue N1 = peekThroughBitcasts(N->getOperand(1));
- if (N0.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
- X = N0.getOperand(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (SDValue Not = IsNOT(N0, DAG)) {
+ X = Not;
Y = N1;
- } else if (N1.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
- X = N1.getOperand(0);
+ } else if (SDValue Not = IsNOT(N1, DAG)) {
+ X = Not;
Y = N0;
} else
return SDValue();
@@ -36046,7 +38169,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
// The type of the truncated inputs.
- if (N0->getOperand(0).getValueType() != VT)
+ if (N0.getOperand(0).getValueType() != VT)
return SDValue();
// The right side has to be a 'trunc' or a constant vector.
@@ -36062,9 +38185,9 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Set N0 and N1 to hold the inputs to the new wide operation.
- N0 = N0->getOperand(0);
+ N0 = N0.getOperand(0);
if (RHSTrunc)
- N1 = N1->getOperand(0);
+ N1 = N1.getOperand(0);
else
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
@@ -36088,34 +38211,35 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- unsigned FPOpcode = ISD::DELETED_NODE;
- if (N->getOpcode() == ISD::AND)
- FPOpcode = X86ISD::FAND;
- else if (N->getOpcode() == ISD::OR)
- FPOpcode = X86ISD::FOR;
- else if (N->getOpcode() == ISD::XOR)
- FPOpcode = X86ISD::FXOR;
-
- assert(FPOpcode != ISD::DELETED_NODE &&
- "Unexpected input node for FP logic conversion");
-
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
- ((Subtarget.hasSSE1() && VT == MVT::i32) ||
- (Subtarget.hasSSE2() && VT == MVT::i64))) {
- SDValue N00 = N0.getOperand(0);
- SDValue N10 = N1.getOperand(0);
- EVT N00Type = N00.getValueType();
- EVT N10Type = N10.getValueType();
- if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
- SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
- return DAG.getBitcast(VT, FPLogic);
- }
+
+ if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N10 = N1.getOperand(0);
+ EVT N00Type = N00.getValueType();
+ EVT N10Type = N10.getValueType();
+
+ // Ensure that both types are the same and are legal scalar fp types.
+ if (N00Type != N10Type ||
+ !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
+ (Subtarget.hasSSE2() && N00Type == MVT::f64)))
+ return SDValue();
+
+ unsigned FPOpcode;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected input node for FP logic conversion");
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
}
- return SDValue();
+
+ SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+ return DAG.getBitcast(VT, FPLogic);
}
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
@@ -36371,6 +38495,24 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineParity(N, DAG, Subtarget))
return V;
+ // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (Mask) {
+ APInt AllBits = APInt::getAllOnesValue(NumElts);
+ return DAG.getSetCC(dl, MVT::i1, Mask,
+ DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
+ }
+ }
+ }
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -36392,9 +38534,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -36440,6 +38580,52 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
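+// e.g. (X & C) | (Y & ~C) is rewritten with andnp(C, Y) = (~C & Y), so the
+// inverted mask ~C never needs to be materialized as a separate constant.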
+static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
+
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
+ return SDValue();
+
+ SDValue N0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue N1 = peekThroughBitcasts(N->getOperand(1));
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // On XOP we'll lower to PCMOV so accept one use, otherwise only
+ // do this if either mask has multiple uses already.
+ if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() ||
+ !N1.getOperand(1).hasOneUse()))
+ return SDValue();
+
+ // Attempt to extract constant byte masks.
+ APInt UndefElts0, UndefElts1;
+ SmallVector<APInt, 32> EltBits0, EltBits1;
+ if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
+ false, false))
+ return SDValue();
+ if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
+ false, false))
+ return SDValue();
+
+ for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
+ // TODO - add UNDEF elts support.
+ if (UndefElts0[i] || UndefElts1[i])
+ return SDValue();
+ if (EltBits0[i] != ~EltBits1[i])
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ SDValue X = N->getOperand(0);
+ SDValue Y =
+ DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
+ DAG.getBitcast(VT, N1.getOperand(0)));
+ return DAG.getNode(ISD::OR, DL, VT, X, Y);
+}
+
// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
if (N->getOpcode() != ISD::OR)
@@ -36472,6 +38658,68 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
return true;
}
+// Try to match:
+// (or (and (M, (sub 0, X)), (pandn M, X)))
+// which is a special case of vselect:
+// (vselect M, (sub 0, X), X)
+// Per:
+// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+// We know that, if fNegate is 0 or 1:
+// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+//
+// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+// ( M ? -X : X) == ((X ^ M ) + (M & 1))
+// This lets us transform our vselect to:
+// (add (xor X, M), (and M, 1))
+// And further to:
+// (sub (xor X, M), M)
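+// e.g. for a lane with M = -1: (X ^ -1) - (-1) = ~X + 1 = -X, and with M = 0
+// the lane is left as (X ^ 0) - 0 = X.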
+static SDValue combineLogicBlendIntoConditionalNegate(
+ EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ EVT MaskVT = Mask.getValueType();
+ assert(MaskVT.isInteger() &&
+ DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
+ "Mask must be zero/all-bits");
+
+ if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
+ return SDValue();
+ if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
+ return SDValue();
+
+ auto IsNegV = [](SDNode *N, SDValue V) {
+ return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+ ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+ };
+
+ SDValue V;
+ if (IsNegV(Y.getNode(), X))
+ V = X;
+ else if (IsNegV(X.getNode(), Y))
+ V = Y;
+ else
+ return SDValue();
+
+ SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+ SDValue SubOp2 = Mask;
+
+ // If the negate was on the false side of the select, then
+ // the operands of the SUB need to be swapped. PR 27251.
+ // This is because the pattern being matched above is
+ // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
+ // but if the pattern matched was
+ // (vselect M, X, (sub (0, X))), that is really negation of the pattern
+ // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+ // pattern also needs to be a negation of the replacement pattern above.
+ // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+ // sub accomplishes the negation of the replacement pattern.
+ if (V == Y)
+ std::swap(SubOp1, SubOp2);
+
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
+}
+
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
@@ -36507,55 +38755,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
- // Try to match:
- // (or (and (M, (sub 0, X)), (pandn M, X)))
- // which is a special case of vselect:
- // (vselect M, (sub 0, X), X)
- // Per:
- // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
- // We know that, if fNegate is 0 or 1:
- // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
- //
- // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
- // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
- // ( M ? -X : X) == ((X ^ M ) + (M & 1))
- // This lets us transform our vselect to:
- // (add (xor X, M), (and M, 1))
- // And further to:
- // (sub (xor X, M), M)
- if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
- DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
- auto IsNegV = [](SDNode *N, SDValue V) {
- return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
- ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
- };
- SDValue V;
- if (IsNegV(Y.getNode(), X))
- V = X;
- else if (IsNegV(X.getNode(), Y))
- V = Y;
-
- if (V) {
- SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
- SDValue SubOp2 = Mask;
-
- // If the negate was on the false side of the select, then
- // the operands of the SUB need to be swapped. PR 27251.
- // This is because the pattern being matched above is
- // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
- // but if the pattern matched was
- // (vselect M, X, (sub (0, X))), that is really negation of the pattern
- // above, -(vselect M, (sub 0, X), X), and therefore the replacement
- // pattern also needs to be a negation of the replacement pattern above.
- // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
- // sub accomplishes the negation of the replacement pattern.
- if (V == Y)
- std::swap(SubOp1, SubOp2);
-
- SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
- return DAG.getBitcast(VT, Res);
- }
- }
+ // Attempt to combine to conditional negate: (sub (xor X, M), M)
+ if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
+ DAG, Subtarget))
+ return Res;
// PBLENDVB is only available on SSE 4.1.
if (!Subtarget.hasSSE41())
@@ -36665,8 +38868,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
if (RHS->getOpcode() == ISD::OR)
std::swap(LHS, RHS);
- EVT VT = OR->getValueType(0);
- SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
+ NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
if (!NewRHS)
return SDValue();
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
@@ -36702,15 +38904,16 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
+ if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
+ return R;
+
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -36718,7 +38921,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
unsigned Bits = VT.getScalarSizeInBits();
// SHLD/SHRD instructions have lower register pressure, but on some
@@ -36747,14 +38950,14 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
SDValue ShMsk0;
if (ShAmt0.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
- ShAmt0.getConstantOperandVal(1) == (Bits - 1)) {
+ ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk0 = ShAmt0;
ShAmt0 = ShAmt0.getOperand(0);
}
SDValue ShMsk1;
if (ShAmt1.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
- ShAmt1.getConstantOperandVal(1) == (Bits - 1)) {
+ ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk1 = ShAmt1;
ShAmt1 = ShAmt1.getOperand(0);
}
@@ -36765,46 +38968,55 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
ShAmt1 = ShAmt1.getOperand(0);
SDLoc DL(N);
- unsigned Opc = X86ISD::SHLD;
+ unsigned Opc = ISD::FSHL;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
- if (ShAmt0.getOpcode() == ISD::SUB ||
- ShAmt0.getOpcode() == ISD::XOR) {
- Opc = X86ISD::SHRD;
+ if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
+ Opc = ISD::FSHR;
std::swap(Op0, Op1);
std::swap(ShAmt0, ShAmt1);
std::swap(ShMsk0, ShMsk1);
}
- // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
- // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
- // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
- // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
- // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> SHLD( X, Y, C )
- // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> SHRD( X, Y, C )
+ auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1,
+ SDValue Amt) {
+ if (Opc == ISD::FSHR)
+ std::swap(Op0, Op1);
+ return DAG.getNode(Opc, DL, VT, Op0, Op1,
+ DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt));
+ };
+
+ // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
+ // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
+ // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
+ // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
+ // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
+ // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
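+  // For reference, the generic funnel-shift semantics assumed above, for bit
+  // width BW and an in-range shift amount 0 < C < BW, are:
+  //   FSHL( X, Y, C ) == ( X << C ) | ( Y >> ( BW - C ) )
+  //   FSHR( X, Y, C ) == ( Y >> C ) | ( X << ( BW - C ) )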
if (ShAmt1.getOpcode() == ISD::SUB) {
SDValue Sum = ShAmt1.getOperand(0);
if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
+ if (ShAmt1Op1.getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
+ ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
+ ShMsk1 = ShAmt1Op1;
+ ShAmt1Op1 = ShAmt1Op1.getOperand(0);
+ }
if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
if ((SumC->getAPIntValue() == Bits ||
(SumC->getAPIntValue() == 0 && ShMsk1)) &&
ShAmt1Op1 == ShAmt0)
- return DAG.getNode(Opc, DL, VT, Op0, Op1,
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ return GetFunnelShift(Op0, Op1, ShAmt0);
}
} else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
- return DAG.getNode(Opc, DL, VT,
- N0.getOperand(0), N1.getOperand(0),
- DAG.getNode(ISD::TRUNCATE, DL,
- MVT::i8, ShAmt0));
+ return GetFunnelShift(Op0, Op1, ShAmt0);
} else if (ShAmt1.getOpcode() == ISD::XOR) {
SDValue Mask = ShAmt1.getOperand(1);
if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
- unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
+ unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
ShAmt1Op0 = ShAmt1Op0.getOperand(0);
@@ -36812,15 +39024,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
(ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
- Op1.getConstantOperandVal(1) == 1) {
- return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ Op1.getConstantOperandAPInt(1) == 1) {
+ return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
Op1.getOperand(0) == Op1.getOperand(1)) {
- return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
}
}
@@ -36862,7 +39072,7 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
// Make sure the shift amount extracts the sign bit.
if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
- Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
+ Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1.
@@ -36915,13 +39125,10 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return SDValue();
// The shift should be smearing the sign bit across each vector element.
- auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
- if (!ShiftBV)
- return SDValue();
-
- EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
- auto *ShiftAmt = ShiftBV->getConstantSplatNode();
- if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+ auto *ShiftAmt =
+ isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
+ if (!ShiftAmt ||
+ ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1. We don't use the more obvious
@@ -37203,15 +39410,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
AVGBuilder);
}
- if (Operands[0].getOpcode() == ISD::ADD)
+  // Matches 'add-like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
+  // Match the 'or' case only if it's 'add-like' - it can be replaced by an add.
+ auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
+ if (ISD::ADD == V.getOpcode()) {
+ Op0 = V.getOperand(0);
+ Op1 = V.getOperand(1);
+ return true;
+ }
+ if (ISD::ZERO_EXTEND != V.getOpcode())
+ return false;
+ V = V.getOperand(0);
+ if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
+ !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
+ return false;
+ Op0 = V.getOperand(0);
+ Op1 = V.getOperand(1);
+ return true;
+ };
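+  // For reference: (or Op0, Op1) == (add Op0, Op1) whenever the operands share
+  // no set bits (no carries can occur), which is what haveNoCommonBitsSet
+  // verifies above; e.g. or(0xF0, 0x03) == add(0xF0, 0x03) == 0xF3.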
+
+ SDValue Op0, Op1;
+ if (FindAddLike(Operands[0], Op0, Op1))
std::swap(Operands[0], Operands[1]);
- else if (Operands[1].getOpcode() != ISD::ADD)
+ else if (!FindAddLike(Operands[1], Op0, Op1))
return SDValue();
- Operands[2] = Operands[1].getOperand(0);
- Operands[1] = Operands[1].getOperand(1);
+ Operands[2] = Op0;
+ Operands[1] = Op1;
// Now we have three operands of two additions. Check that one of them is a
- // constant vector with ones, and the other two are promoted from i8/i16.
+ // constant vector with ones, and the other two can be promoted from i8/i16.
for (int i = 0; i < 3; ++i) {
if (!IsConstVectorInRange(Operands[i], 1, 1))
continue;
@@ -37219,14 +39446,16 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
// Check if Operands[0] and Operands[1] are results of type promotion.
for (int j = 0; j < 2; ++j)
- if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
- Operands[j].getOperand(0).getValueType() != VT)
- return SDValue();
+ if (Operands[j].getValueType() != VT) {
+ if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+ Operands[j].getOperand(0).getValueType() != VT)
+ return SDValue();
+ Operands[j] = Operands[j].getOperand(0);
+ }
// The pattern is detected, emit X86ISD::AVG instruction(s).
- return SplitOpsAndApply(DAG, Subtarget, DL, VT,
- { Operands[0].getOperand(0),
- Operands[1].getOperand(0) }, AVGBuilder);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
+ AVGBuilder);
}
return SDValue();
@@ -37246,38 +39475,51 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
- unsigned AddressSpace = Ld->getAddressSpace();
unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
- AddressSpace, Alignment, &Fast) && !Fast))) {
+ *Ld->getMemOperand(), &Fast) &&
+ !Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
- SDValue Ptr = Ld->getBasePtr();
-
+ unsigned HalfAlign = 16;
+ SDValue Ptr1 = Ld->getBasePtr();
+ SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
- NumElems/2);
+ NumElems / 2);
SDValue Load1 =
- DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+ DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
Alignment, Ld->getMemOperand()->getFlags());
-
- Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
- SDValue Load2 =
- DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
- Ld->getPointerInfo().getWithOffset(16),
- MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
+ SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
+ Ld->getPointerInfo().getWithOffset(HalfAlign),
+ MinAlign(Alignment, HalfAlign),
+ Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- Load1.getValue(1),
- Load2.getValue(1));
+ Load1.getValue(1), Load2.getValue(1));
SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
return DCI.CombineTo(N, NewVec, TF, true);
}
+ // Bool vector load - attempt to cast to an integer, as we have good
+ // (vXiY *ext(vXi1 bitcast(iX))) handling.
+ if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
+ RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
+ unsigned NumElts = RegVT.getVectorNumElements();
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ if (TLI.isTypeLegal(IntVT)) {
+ SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Alignment,
+ Ld->getMemOperand()->getFlags());
+ SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
+ return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
+ }
+ }
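+  // For example, on a pre-AVX512 target a plain load of <8 x i1> is re-emitted
+  // above as a single i8 load plus a bitcast back to <8 x i1>, so the existing
+  // (vXiY *ext(vXi1 bitcast(iX))) combines can pick it up.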
+
return SDValue();
}
@@ -37404,6 +39646,9 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
if (ML->getPassThru().isUndef())
return SDValue();
+ if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
+ return SDValue();
+
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
@@ -37434,7 +39679,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
return Blend;
}
- if (Mld->getExtensionType() != ISD::SEXTLOAD)
+ if (Mld->getExtensionType() != ISD::EXTLOAD)
return SDValue();
// Resolve extending loads.
@@ -37504,8 +39749,20 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
Mld->getBasePtr(), NewMask, WidePassThru,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
- SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG);
- return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+
+ SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd);
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i * SizeRatio] = i;
+
+ // Can't shuffle using an illegal type.
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
+ SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT), ShuffleVec);
+ SlicedVec = DAG.getBitcast(VT, SlicedVec);
+
+ return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked store,
@@ -37543,6 +39800,10 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = Mst->getValue().getValueType();
+ EVT StVT = Mst->getMemoryVT();
+ SDLoc dl(Mst);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
if (!Mst->isTruncatingStore()) {
if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
return ScalarStore;
@@ -37551,7 +39812,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
@@ -37561,20 +39821,25 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// pattern above, but that pattern will be different. It will either need to
// match setcc more generally or match PCMPGTM later (in tablegen?).
+ SDValue Value = Mst->getValue();
+ if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+ Mst->getMemoryVT())) {
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
+ Mst->getBasePtr(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(), true);
+ }
+
return SDValue();
}
// Resolve truncating stores.
unsigned NumElems = VT.getVectorNumElements();
- EVT StVT = Mst->getMemoryVT();
- SDLoc dl(Mst);
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getScalarSizeInBits();
unsigned ToSz = StVT.getScalarSizeInBits();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
// The truncating store is legal in some cases. For example
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
// are designated for truncate store.
@@ -37644,11 +39909,13 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
+ unsigned Alignment = St->getAlignment();
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -37699,8 +39966,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoredVal->ops().slice(32, 32));
Hi = combinevXi1ConstantToInteger(Hi, DAG);
- unsigned Alignment = St->getAlignment();
-
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
@@ -37724,30 +39989,48 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// If we are saving a concatenation of two XMM registers and 32-byte stores
// are slow, such as on Sandy Bridge, perform two 16-byte stores.
bool Fast;
- unsigned AddressSpace = St->getAddressSpace();
- unsigned Alignment = St->getAlignment();
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- AddressSpace, Alignment, &Fast) &&
+ *St->getMemOperand(), &Fast) &&
!Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
- SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
- SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
+ return splitVectorStore(St, DAG);
+ }
- SDValue Ptr0 = St->getBasePtr();
- SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
+ // Split under-aligned vector non-temporal stores.
+ if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
+ // ZMM/YMM nt-stores - either it can be stored as a series of shorter
+ // vectors or the legalizer can scalarize it to use MOVNTI.
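+    // e.g. a 64-byte nt-store with only 16-byte alignment is split here (and
+    // again when the halves are revisited) into four 16-byte MOVNTPS/MOVNTDQ
+    // stores.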
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ unsigned NumElems = VT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+ return splitVectorStore(St, DAG);
+ }
+
+ // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
+ // to use MOVNTI.
+ if (VT.is128BitVector() && Subtarget.hasSSE2()) {
+ MVT NTVT = Subtarget.hasSSE4A()
+ ? MVT::v2f64
+ : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
+ return scalarizeVectorStore(St, NTVT, DAG);
+ }
+ }
- SDValue Ch0 =
- DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
- Alignment, St->getMemOperand()->getFlags());
- SDValue Ch1 =
- DAG.getStore(St->getChain(), dl, Value1, Ptr1,
- St->getPointerInfo().getWithOffset(16),
- MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
+  // supported but AVX512F is, by extending to v16i32 and truncating.
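+  // (With AVX512F the widened v16i32 value can be truncate-stored with a single
+  // VPMOVDB; the direct v16i16->v16i8 truncation, VPMOVWB, needs AVX512BW.)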
+ if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
+ St->getValue().getOpcode() == ISD::TRUNCATE &&
+ St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
+ TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) &&
+ !DCI.isBeforeLegalizeOps()) {
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
+ return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
+ MVT::v16i8, St->getMemOperand());
}
// Optimize trunc store (of multiple scalars) to shuffle and store.
@@ -37763,7 +40046,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (SDValue Val =
detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
TLI))
@@ -37867,7 +40149,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
- if ((VT.isVector() ||
+ if (((VT.isVector() && !VT.isFloatingPoint()) ||
(VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
@@ -37890,8 +40172,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
if (Subtarget.is64Bit() || F64IsLegal) {
- MVT LdVT = (Subtarget.is64Bit() &&
- (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64;
+ MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
Ld->getMemOperand());
@@ -37965,7 +40246,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ bool IsCommutative) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
return false;
@@ -37979,51 +40262,83 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
// which is A horizontal-op B.
- // At least one of the operands should be a vector shuffle.
- if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
- RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
- return false;
-
MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // TODO - can we make a general helper method that does all of this for us?
+ auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
+ SmallVectorImpl<int> &ShuffleMask) {
+ if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (!Op.getOperand(0).isUndef())
+ N0 = Op.getOperand(0);
+ if (!Op.getOperand(1).isUndef())
+ N1 = Op.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+ ShuffleMask.append(Mask.begin(), Mask.end());
+ return;
+ }
+ bool UseSubVector = false;
+ if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op.getOperand(0).getValueType().is256BitVector() &&
+ llvm::isNullConstant(Op.getOperand(1))) {
+ Op = Op.getOperand(0);
+ UseSubVector = true;
+ }
+ bool IsUnary;
+ SmallVector<SDValue, 2> SrcOps;
+ SmallVector<int, 16> SrcShuffleMask;
+ SDValue BC = peekThroughBitcasts(Op);
+ if (isTargetShuffle(BC.getOpcode()) &&
+ getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
+ SrcOps, SrcShuffleMask, IsUnary)) {
+ if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
+ SrcOps.size() <= 2) {
+ N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
+ N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
+ ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
+ }
+ if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
+ SrcOps.size() == 1) {
+ N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
+ N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
+ ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
+ ShuffleMask.append(Mask.begin(), Mask.end());
+ }
+ }
+ };
// View LHS in the form
// LHS = VECTOR_SHUFFLE A, B, LMask
// If LHS is not a shuffle, then pretend it is the identity shuffle:
// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: A default initialized SDValue represents an UNDEF of type VT.
- unsigned NumElts = VT.getVectorNumElements();
SDValue A, B;
- SmallVector<int, 16> LMask(NumElts);
- if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!LHS.getOperand(0).isUndef())
- A = LHS.getOperand(0);
- if (!LHS.getOperand(1).isUndef())
- B = LHS.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
- llvm::copy(Mask, LMask.begin());
- } else {
- A = LHS;
- for (unsigned i = 0; i != NumElts; ++i)
- LMask[i] = i;
- }
+ SmallVector<int, 16> LMask;
+ GetShuffle(LHS, A, B, LMask);
// Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
- SmallVector<int, 16> RMask(NumElts);
- if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!RHS.getOperand(0).isUndef())
- C = RHS.getOperand(0);
- if (!RHS.getOperand(1).isUndef())
- D = RHS.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
- llvm::copy(Mask, RMask.begin());
- } else {
+ SmallVector<int, 16> RMask;
+ GetShuffle(RHS, C, D, RMask);
+
+ // At least one of the operands should be a vector shuffle.
+ unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
+ if (NumShuffles == 0)
+ return false;
+
+ if (LMask.empty()) {
+ A = LHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ LMask.push_back(i);
+ }
+
+ if (RMask.empty()) {
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
- RMask[i] = i;
+ RMask.push_back(i);
}
// If A and B occur in reverse order in RHS, then canonicalize by commuting
@@ -38072,6 +40387,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+
+ if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
+ return false;
+
+ LHS = DAG.getBitcast(VT, LHS);
+ RHS = DAG.getBitcast(VT, RHS);
return true;
}
@@ -38088,8 +40409,7 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, IsFadd) &&
- shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget))
+ isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
return SDValue();
@@ -38105,7 +40425,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
const SDLoc &DL) {
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
SDValue Src = N->getOperand(0);
- unsigned Opcode = Src.getOpcode();
+ unsigned SrcOpcode = Src.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
@@ -38123,14 +40443,17 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
return true;
// See if this is a single use constant which can be constant folded.
- SDValue BC = peekThroughOneUseBitcasts(Op);
- return ISD::isBuildVectorOfConstantSDNodes(BC.getNode());
+    // NOTE: We don't peek through bitcasts here because there is currently
+    // no support for constant folding truncate+bitcast+vector_of_constants, so
+    // we'd just end up with a truncate on both operands, which would
+    // get turned back into (truncate (binop)), causing an infinite loop.
+ return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
};
auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
- return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+ return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
};
// Don't combine if the operation has other uses.
@@ -38145,13 +40468,13 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// In most cases its only worth pre-truncating if we're only facing the cost
// of one truncation.
// i.e. if one of the inputs will constant fold or the input is repeated.
- switch (Opcode) {
+ switch (SrcOpcode) {
case ISD::AND:
case ISD::XOR:
case ISD::OR: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+ if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
@@ -38160,14 +40483,15 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
case ISD::MUL:
  // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
  // better to truncate if we have the chance.
- if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
- !TLI.isOperationLegal(Opcode, SrcVT))
+ if (SrcVT.getScalarType() == MVT::i64 &&
+ TLI.isOperationLegal(SrcOpcode, VT) &&
+ !TLI.isOperationLegal(SrcOpcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegal(Opcode, VT) &&
+ if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
@@ -38177,7 +40501,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// truncatable to avoid interfering with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegal(Opcode, VT) &&
+ if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
return TruncateArithmetic(Op0, Op1);
break;
@@ -38188,36 +40512,19 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
}
/// Truncate using ISD::AND mask and X86ISD::PACKUS.
+/// e.g. trunc <8 x i32> X to <8 x i16> -->
+/// MaskX = X & 0xffff (clear high bits to prevent saturation)
+/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
- EVT InSVT = InVT.getVectorElementType();
EVT OutVT = N->getValueType(0);
- EVT OutSVT = OutVT.getVectorElementType();
-
- // Split a long vector into vectors of legal type and mask to unset all bits
- // that won't appear in the result to prevent saturation.
- // TODO - we should be doing this at the maximum legal size but this is
- // causing regressions where we're concatenating back to max width just to
- // perform the AND and then extracting back again.....
- unsigned NumSubRegs = InVT.getSizeInBits() / 128;
- unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
- EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
- SmallVector<SDValue, 8> SubVecs(NumSubRegs);
-
- APInt Mask =
- APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
- SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
-
- for (unsigned i = 0; i < NumSubRegs; i++) {
- SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
- DAG.getIntPtrConstant(i * NumSubRegElts, DL));
- SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
- }
- In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
+ APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
+ OutVT.getScalarSizeInBits());
+ In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
}
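
A minimal standalone sketch (not part of the patch) of why the AND mask matters here:
PACKUS saturates each signed source lane into the unsigned destination range, so clearing
the bits that cannot appear in the result beforehand makes the pack act as a plain
truncation.

#include <cstdint>

// Per-lane model of PACKUSDW: signed i32 -> unsigned i16 with saturation.
static uint16_t packusLane(int32_t v) {
  if (v < 0)
    return 0;
  if (v > 0xFFFF)
    return 0xFFFF;
  return (uint16_t)v;
}

// trunc i32 -> i16 expressed as mask + pack, matching the combine above.
static uint16_t truncViaMaskAndPack(uint32_t v) {
  return packusLane((int32_t)(v & 0xFFFFu)); // masked value is always in [0, 0xFFFF]
}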
@@ -38580,16 +40887,23 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
if (N->getOpcode() == ISD::FNEG)
return N->getOperand(0);
+ unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
+
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
- auto VT = Op->getValueType(0);
+ EVT VT = Op->getValueType(0);
+  // Make sure the element size doesn't change.
+ if (VT.getScalarSizeInBits() != ScalarSize)
+ return SDValue();
+
if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
if (!SVOp->getOperand(1).isUndef())
return SDValue();
if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode()))
- return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
- SVOp->getMask());
+ if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
+ return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
+ SVOp->getMask());
return SDValue();
}
unsigned Opc = Op.getOpcode();
@@ -38601,19 +40915,17 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
if (!InsVector.isUndef())
return SDValue();
if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode()))
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
- NegInsVal, Op.getOperand(2));
+ if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
+ NegInsVal, Op.getOperand(2));
return SDValue();
}
if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
return SDValue();
- SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
- if (!Op1.getValueType().isFloatingPoint())
- return SDValue();
-
- SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op0 = Op.getOperand(0);
// For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
// masks. For FSUB, we have to check if constant bits of Op0 are sign bit
@@ -38625,7 +40937,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
SmallVector<APInt, 16> EltBits;
// Extract constant bits and see if they are all sign bit masks. Ignore the
// undef elements.
- if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(),
+ if (getTargetConstantBitsFromNode(Op1, ScalarSize,
UndefElts, EltBits,
/* AllowWholeUndefs */ true,
/* AllowPartialUndefs */ false)) {
@@ -38922,13 +41234,12 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
if (Subtarget.useSoftFloat())
return SDValue();
- // TODO: If an operand is already known to be a NaN or not a NaN, this
- // should be an optional swap and FMAX/FMIN.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
- if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
- (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
- (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
+ if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
+ (Subtarget.hasSSE2() && VT == MVT::f64) ||
+ (VT.isVector() && TLI.isTypeLegal(VT))))
return SDValue();
SDValue Op0 = N->getOperand(0);
@@ -38941,13 +41252,20 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+  // If one of the operands is known to be non-NaN, use the native min/max
+  // instructions with the non-NaN input as the second operand.
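+  // (The SSE/AVX MIN/MAX instructions return their second source operand when
+  // either input is a NaN, so this still yields the IEEE minnum/maxnum result.)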
+ if (DAG.isKnownNeverNaN(Op1))
+ return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+ if (DAG.isKnownNeverNaN(Op0))
+ return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
+
// If we have to respect NaN inputs, this takes at least 3 instructions.
// Favor a library call when operating on a scalar and minimizing code size.
- if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
- EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
- DAG.getDataLayout(), *DAG.getContext(), VT);
+ EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ VT);
// There are 4 possibilities involving NaN inputs, and these are the required
// outputs:
@@ -38987,6 +41305,69 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
KnownZero, DCI))
return SDValue(N, 0);
+ // Convert a full vector load into vzload when not all bits are needed.
+ SDValue In = N->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+ ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+ assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ // Unless the load is volatile.
+ if (!LN->isVolatile()) {
+ SDLoc dl(N);
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getIntegerVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+ LN->getPointerInfo(),
+ LN->getAlignment(),
+ LN->getMemOperand()->getFlags());
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+ DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ return SDValue(N, 0);
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+
+ // Convert a full vector load into vzload when not all bits are needed.
+ SDValue In = N->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+ ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+ assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ // Unless the load is volatile.
+ if (!LN->isVolatile()) {
+ SDLoc dl(N);
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getFloatingPointVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+ LN->getPointerInfo(),
+ LN->getAlignment(),
+ LN->getMemOperand()->getFlags());
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+ DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ return SDValue(N, 0);
+ }
+ }
+
return SDValue();
}
@@ -39005,18 +41386,14 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(0, SDLoc(N), VT);
// Turn ANDNP back to AND if input is inverted.
- if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) {
- return DAG.getNode(ISD::AND, SDLoc(N), VT,
- N->getOperand(0).getOperand(0), N->getOperand(1));
- }
+ if (SDValue Not = IsNOT(N->getOperand(0), DAG))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
+ N->getOperand(1));
// Attempt to recursively combine a bitmask ANDNP with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -39039,18 +41416,24 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
// Try to combine sext_in_reg of a cmov of constants by extending the constants.
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
+ EVT DstVT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
- if (ExtraVT != MVT::i16)
+ if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
return SDValue();
- // Look through single use any_extends.
- if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
+ // Look through single use any_extends / truncs.
+ SDValue IntermediateBitwidthOp;
+ if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
+ N0.hasOneUse()) {
+ IntermediateBitwidthOp = N0;
N0 = N0.getOperand(0);
+ }
// See if we have a single use cmov.
if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
@@ -39066,21 +41449,37 @@ static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
- // If we looked through an any_extend above, add one to the constants.
- if (N0.getValueType() != VT) {
- CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
- CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
+ // If we looked through an any_extend/trunc above, add one to the constants.
+ if (IntermediateBitwidthOp) {
+ unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
+ CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
+ CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
+ }
+
+ CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
+ CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
+
+ EVT CMovVT = DstVT;
+ // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
+ if (DstVT == MVT::i16) {
+ CMovVT = MVT::i32;
+ CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
+ CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
}
- CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
- CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
+ SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
+ N0.getOperand(2), N0.getOperand(3));
- return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
- N0.getOperand(2), N0.getOperand(3));
+ if (CMovVT != DstVT)
+ CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
+
+ return CMov;
}
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
if (SDValue V = combineSextInRegCmov(N, DAG))
return V;
@@ -39336,6 +41735,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
return SDValue();
unsigned Opcode = N->getOpcode();
+ // TODO - add ANY_EXTEND support.
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
@@ -39382,13 +41782,13 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
- EVT InVT = N.getValueType();
- EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
- Size / InVT.getScalarSizeInBits());
- SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
- DAG.getUNDEF(InVT));
+ EVT SrcVT = N.getValueType();
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
+ Size / SrcVT.getScalarSizeInBits());
+ SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(),
+ DAG.getUNDEF(SrcVT));
Opnds[0] = N;
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds);
};
// If target-size is less than 128-bits, extend to a type that would extend
@@ -39410,8 +41810,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
(VT.is256BitVector() && Subtarget.hasAVX()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
- Opcode = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
- : ISD::ZERO_EXTEND_VECTOR_INREG;
+ Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
return DAG.getNode(Opcode, DL, VT, ExOp);
}
@@ -39421,9 +41820,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
- unsigned IROpc = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
- : ISD::ZERO_EXTEND_VECTOR_INREG;
-
+ unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode);
SmallVector<SDValue, 8> Opnds;
for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
@@ -39457,7 +41854,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
// Only do this combine with AVX512 for vector extends.
- if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
+ if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
return SDValue();
// Only combine legal element types.
@@ -39473,7 +41870,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
// that's the only integer compares with we have.
- ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
if (ISD::isUnsignedIntSetCC(CC))
return SDValue();
@@ -39629,6 +42026,10 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
if (!NegVal)
return SDValue();
+ // FIXME: Should we bitcast instead?
+ if (NegVal.getValueType() != VT)
+ return SDValue();
+
unsigned NewOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
@@ -39705,6 +42106,20 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
return R;
+ // TODO: Combine with any target/faux shuffle.
+ if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
+ VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ unsigned NumSrcElts = N00.getValueType().getVectorNumElements();
+ unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
+ APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
+ if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
+ (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
+ return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128);
+ }
+ }
+
return SDValue();
}
@@ -39734,9 +42149,14 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
if (isNullConstant(Y) && !IsOrXorXorCCZero)
return SDValue();
- // Bail out if we know that this is not really just an oversized integer.
- if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
- peekThroughBitcasts(Y).getValueType() == MVT::f128)
+ // Don't perform this combine if constructing the vector will be expensive.
+ auto IsVectorBitCastCheap = [](SDValue X) {
+ X = peekThroughBitcasts(X);
+ return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
+ X.getOpcode() == ISD::LOAD;
+ };
+ if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
+ !IsOrXorXorCCZero)
return SDValue();
// TODO: Use PXOR + PTEST for SSE4.1 or later?
@@ -39873,66 +42293,44 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = N->getSimpleValueType(0);
+ unsigned NumBits = VT.getScalarSizeInBits();
+ unsigned NumElts = SrcVT.getVectorNumElements();
// Perform constant folding.
if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
- assert(VT== MVT::i32 && "Unexpected result type");
+ assert(VT == MVT::i32 && "Unexpected result type");
APInt Imm(32, 0);
for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
- SDValue In = Src.getOperand(Idx);
- if (!In.isUndef() &&
- cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
+ if (!Src.getOperand(Idx).isUndef() &&
+ Src.getConstantOperandAPInt(Idx).isNegative())
Imm.setBit(Idx);
}
return DAG.getConstant(Imm, SDLoc(N), VT);
}
// Look through int->fp bitcasts that don't change the element width.
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse() &&
- SrcVT.isFloatingPoint() &&
- Src.getOperand(0).getValueType() ==
- EVT(SrcVT).changeVectorElementTypeToInteger())
- Src = Src.getOperand(0);
+ unsigned EltWidth = SrcVT.getScalarSizeInBits();
+ if (Src.getOpcode() == ISD::BITCAST &&
+ Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
+ return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
+
+  // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
+  // with scalar comparisons.
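+  // e.g. for v4i32: movmsk(not(x)) == movmsk(x) ^ 0b1111 - only the low NumElts
+  // bits of the scalar result are meaningful, hence the getLowBitsSet mask.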
+ if (SDValue NotSrc = IsNOT(Src, DAG)) {
+ SDLoc DL(N);
+ APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
+ NotSrc = DAG.getBitcast(SrcVT, NotSrc);
+ return DAG.getNode(ISD::XOR, DL, VT,
+ DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
+ DAG.getConstant(NotMask, DL, VT));
+ }
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
+ APInt DemandedMask(APInt::getAllOnesValue(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
- // Combine (movmsk (setne (and X, (1 << C)), 0)) -> (movmsk (X << C)).
- // Only do this when the setcc input and output types are the same and the
- // setcc and the 'and' node have a single use.
- // FIXME: Support 256-bits with AVX1. The movmsk is split, but the and isn't.
- APInt SplatVal;
- if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
- Src.getOperand(0).getValueType() == Src.getValueType() &&
- cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETNE &&
- ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
- Src.getOperand(0).getOpcode() == ISD::AND) {
- SDValue And = Src.getOperand(0);
- if (And.hasOneUse() &&
- ISD::isConstantSplatVector(And.getOperand(1).getNode(), SplatVal) &&
- SplatVal.isPowerOf2()) {
- MVT VT = Src.getSimpleValueType();
- unsigned BitWidth = VT.getScalarSizeInBits();
- unsigned ShAmt = BitWidth - SplatVal.logBase2() - 1;
- SDLoc DL(And);
- SDValue X = And.getOperand(0);
- // If the element type is i8, we need to bitcast to i16 to use a legal
- // shift. If we wait until lowering we end up with an extra and to bits
- // from crossing the 8-bit elements, but we don't care about that here.
- if (VT.getVectorElementType() == MVT::i8) {
- VT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
- X = DAG.getBitcast(VT, X);
- }
- SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
- DAG.getConstant(ShAmt, DL, VT));
- SDValue Cast = DAG.getBitcast(SrcVT, Shl);
- return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), N->getValueType(0), Cast);
- }
- }
-
return SDValue();
}
@@ -40065,8 +42463,7 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
- if (BuildVectorSDNode *BV =
- dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
@@ -40088,6 +42485,41 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
return SDValue();
}
+/// If we are converting a value to floating-point, try to replace scalar
+/// truncate of an extracted vector element with a bitcast. This tries to keep
+/// the sequence on XMM registers rather than moving between vector and GPRs.
+static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
+ // TODO: This is currently only used by combineSIntToFP, but it is generalized
+ // to allow being called by any similar cast opcode.
+ // TODO: Consider merging this into lowering: vectorizeExtractedCast().
+ SDValue Trunc = N->getOperand(0);
+ if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ SDValue ExtElt = Trunc.getOperand(0);
+ if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isNullConstant(ExtElt.getOperand(1)))
+ return SDValue();
+
+ EVT TruncVT = Trunc.getValueType();
+ EVT SrcVT = ExtElt.getValueType();
+ unsigned DestWidth = TruncVT.getSizeInBits();
+ unsigned SrcWidth = SrcVT.getSizeInBits();
+ if (SrcWidth % DestWidth != 0)
+ return SDValue();
+
+ // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
+ EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
+ unsigned VecWidth = SrcVecVT.getSizeInBits();
+ unsigned NumElts = VecWidth / DestWidth;
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
+ SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
+ SDLoc DL(N);
+ SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
+ BitcastVec, ExtElt.getOperand(1));
+ return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
+}
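
A standalone sketch (not part of the patch) of the equivalence the combine above exploits
on a little-endian target such as x86: truncating lane 0 of a wider-element vector equals
reading lane 0 of the narrower-element bitcast of the same bits.

#include <cstdint>
#include <cstring>

// (uint32_t)src[0] computed through the v2i64 -> v4i32 "bitcast" view.
static uint32_t truncLane0ViaBitcast(const uint64_t src[2]) {
  uint32_t lanes[4];
  std::memcpy(lanes, src, sizeof(lanes)); // reinterpret the 128-bit payload
  return lanes[0];                        // little-endian: equals (uint32_t)src[0]
}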
+
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
@@ -40181,6 +42613,10 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
return FILDChain;
}
}
+
+ if (SDValue V = combineToFPTruncExtElt(N, DAG))
+ return V;
+
return SDValue();
}
@@ -40267,13 +42703,13 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
onlyZeroFlagUsed(SDValue(N, 0))) {
- EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
- unsigned ShAmt = Op.getConstantOperandVal(1);
- if (ShAmt < BitWidth) { // Avoid undefined shifts.
+ const APInt &ShAmt = Op.getConstantOperandAPInt(1);
+ if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
+ unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
APInt Mask = Op.getOpcode() == ISD::SRL
- ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
- : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+ ? APInt::getHighBitsSet(BitWidth, MaskBits)
+ : APInt::getLowBitsSet(BitWidth, MaskBits);
if (Mask.isSignedIntN(32)) {
Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
DAG.getConstant(Mask, dl, VT));
@@ -40283,7 +42719,6 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
}
}
-
// Look for a truncate with a single use.
if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
return SDValue();
@@ -40337,8 +42772,42 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
return Op.getValue(1);
}
+static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
+ "Expected X86ISD::ADD or X86ISD::SUB");
+
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ MVT VT = LHS.getSimpleValueType();
+ unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
+
+ // If we don't use the flag result, simplify back to a generic ADD/SUB.
+ if (!N->hasAnyUseOfValue(1)) {
+ SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
+ return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
+ }
+
+ // Fold any similar generic ADD/SUB opcodes to reuse this node.
+ auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
+ SDValue Ops[] = {N0, N1};
+ SDVTList VTs = DAG.getVTList(N->getValueType(0));
+ if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
+ SDValue Op(N, 0);
+ if (Negate)
+ Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
+ DCI.CombineTo(GenericAddSub, Op);
+ }
+ };
+ MatchGeneric(LHS, RHS, false);
+ MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
+
+ return SDValue();
+}
+
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
- if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
@@ -40346,6 +42815,15 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
Flags);
}
+ // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
+ // iff the flag result is dead.
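+  // (Value-wise both forms compute X - Y - Carry; only the EFLAGS they produce
+  // differ, which is why the flag result (value #1) must be unused.)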
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
+ !N->hasAnyUseOfValue(1))
+ return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
+ Op0.getOperand(1), N->getOperand(2));
+
return SDValue();
}
@@ -40372,7 +42850,7 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
return DCI.CombineTo(N, Res1, CarryOut);
}
- if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
@@ -40468,7 +42946,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
@@ -40575,8 +43053,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
// Madd vector size is half of the original vector size
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
auto BuildPMADDWD = [&](SDValue Mul) {
@@ -40631,10 +43109,10 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
return SDValue();
// We know N is a reduction add, which means one of its operands is a phi.
- // To match SAD, we need the other operand to be a vector select.
- if (Op0.getOpcode() != ISD::VSELECT)
+ // To match SAD, we need the other operand to be a ABS.
+ if (Op0.getOpcode() != ISD::ABS)
std::swap(Op0, Op1);
- if (Op0.getOpcode() != ISD::VSELECT)
+ if (Op0.getOpcode() != ISD::ABS)
return SDValue();
auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) {
@@ -40673,7 +43151,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
Op0 = BuildPSADBW(SadOp0, SadOp1);
// It's possible we have a sad on the other side too.
- if (Op1.getOpcode() == ISD::VSELECT &&
+ if (Op1.getOpcode() == ISD::ABS &&
detectZextAbsDiff(Op1, SadOp0, SadOp1)) {
Op1 = BuildPSADBW(SadOp0, SadOp1);
}
@@ -40815,39 +43293,6 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
PMADDBuilder);
}
-// Try to turn (add (umax X, C), -C) into (psubus X, C)
-static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // psubus is available in SSE2 for i8 and i16 vectors.
- if (!VT.isVector() || VT.getVectorNumElements() < 2 ||
- !isPowerOf2_32(VT.getVectorNumElements()) ||
- !(VT.getVectorElementType() == MVT::i8 ||
- VT.getVectorElementType() == MVT::i16))
- return SDValue();
-
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- if (Op0.getOpcode() != ISD::UMAX)
- return SDValue();
-
- // The add should have a constant that is the negative of the max.
- // TODO: Handle build_vectors with undef elements.
- auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
- return Max->getAPIntValue() == (-Op->getAPIntValue());
- };
- if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT))
- return SDValue();
-
- SDLoc DL(N);
- return DAG.getNode(ISD::USUBSAT, DL, VT, Op0.getOperand(0),
- Op0.getOperand(1));
-}
-
// Attempt to turn this pattern into PMADDWD.
// (mul (add (zext (build_vector)), (zext (build_vector))),
// (add (zext (build_vector)), (zext (build_vector)))
@@ -40957,12 +43402,12 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
- EVT InVT = Ops[0].getValueType();
- assert(InVT.getScalarType() == MVT::i16 &&
+ EVT OpVT = Ops[0].getValueType();
+ assert(OpVT.getScalarType() == MVT::i16 &&
"Unexpected scalar element type");
- assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
+ assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- InVT.getVectorNumElements() / 2);
+ OpVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
@@ -40990,8 +43435,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal adds from adds of shuffles.
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
- Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) &&
- shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
+ Subtarget.hasSSSE3() &&
+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) {
auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
@@ -41003,9 +43448,6 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineIncDecVector(N, DAG))
return V;
- if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget))
- return V;
-
return combineAddOrSubToADCOrSBB(N, DAG);
}
@@ -41110,7 +43552,7 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
// X-Y -> X+~Y+1, saving one register.
if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
isa<ConstantSDNode>(Op1.getOperand(1))) {
- APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+ const APInt &XorC = Op1.getConstantOperandAPInt(1);
EVT VT = Op0.getValueType();
SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
Op1.getOperand(0),
@@ -41124,8 +43566,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
- Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) &&
- shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
+ Subtarget.hasSSSE3() &&
+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) {
auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
@@ -41159,6 +43601,149 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Helper that combines an array of subvector ops as if they were the operands
+/// of an ISD::CONCAT_VECTORS node; the ops may have come from another source
+/// (e.g. ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
+static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
+ ArrayRef<SDValue> Ops, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
+
+ if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+ return DAG.getUNDEF(VT);
+
+ if (llvm::all_of(Ops, [](SDValue Op) {
+ return ISD::isBuildVectorAllZeros(Op.getNode());
+ }))
+ return getZeroVector(VT, Subtarget, DAG, DL);
+
+ SDValue Op0 = Ops[0];
+
+ // Fold subvector loads into one.
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
+ bool Fast;
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *FirstLd->getMemOperand(), &Fast) &&
+ Fast) {
+ if (SDValue Ld =
+ EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
+ return Ld;
+ }
+ }
+
+ // Repeated subvectors.
+ if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
+ // If this broadcast/subv_broadcast is inserted into both halves, use a
+ // larger broadcast/subv_broadcast.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST ||
+ Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
+ return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
+
+ // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
+ if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
+ (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
+ Op0.getOperand(0),
+ DAG.getIntPtrConstant(0, DL)));
+
+ // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
+ if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ (Subtarget.hasAVX2() ||
+ (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+ Op0.getOperand(0).getValueType() == VT.getScalarType())
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
+ }
+
+ bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
+
+ // Repeated opcode.
+ // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
+ // but it currently struggles with different vector widths.
+ if (llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op.getOpcode() == Op0.getOpcode();
+ })) {
+ unsigned NumOps = Ops.size();
+ switch (Op0.getOpcode()) {
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFD:
+ if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
+ Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::VPERMILPI:
+ // TODO - add support for vXf64/vXi64 shuffles.
+ if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
+ Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
+ Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
+ Op0.getOperand(1));
+ return DAG.getBitcast(VT, Res);
+ }
+ break;
+ case X86ISD::PACKUS:
+ if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
+ SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ NumOps * SrcVT.getVectorNumElements());
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
+ }
+ break;
+ }
+ }
+
+ // If we're inserting all zeros into the upper half, change this to
+ // an insert into an all zeros vector. We will match this to a move
+ // with implicit upper bit zeroing during isel.
+ if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode()))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), Ops[0],
+ DAG.getIntPtrConstant(0, DL));
+
+ return SDValue();
+}
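As a concrete instance of the repeated-opcode case above (types illustrative): with AVX2, concat_vectors(v4i32 pshufd(a, imm), v4i32 pshufd(b, imm)) with the same immediate becomes v8i32 pshufd(concat_vectors(a, b), imm), since the 256-bit form operates per 128-bit lane. The PACKUS case concatenates the LHS and RHS operands first for the same reason.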
+
+static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = N->getOperand(0).getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Don't do anything for i1 vectors.
+ if (VT.getVectorElementType() == MVT::i1)
+ return SDValue();
+
+ if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
+ SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
+ if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
+ DCI, Subtarget))
+ return R;
+ }
+
+ return SDValue();
+}
+
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -41173,19 +43758,23 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
- unsigned IdxVal = N->getConstantOperandVal(2);
+ uint64_t IdxVal = N->getConstantOperandVal(2);
MVT SubVecVT = SubVec.getSimpleValueType();
- if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
- // Inserting zeros into zeros is a nop.
- if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
- return getZeroVector(OpVT, Subtarget, DAG, dl);
+ if (Vec.isUndef() && SubVec.isUndef())
+ return DAG.getUNDEF(OpVT);
+
+ // Inserting undefs/zeros into zeros/undefs is a zero vector.
+ if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
+ (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
+ if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
// If we're inserting into a zero vector and then into a larger zero vector,
// just insert into the larger zero vector directly.
if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
- unsigned Idx2Val = SubVec.getConstantOperandVal(2);
+ uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
SubVec.getOperand(1),
@@ -41197,30 +43786,16 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// least as large as the original insertion. Just insert the original
// subvector into a zero vector.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
- SubVec.getConstantOperandVal(1) == 0 &&
+ SubVec.getConstantOperandAPInt(1) == 0 &&
SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Ins = SubVec.getOperand(0);
- if (Ins.getConstantOperandVal(2) == 0 &&
+ if (Ins.getConstantOperandAPInt(2) == 0 &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
Ins.getOperand(1), N->getOperand(2));
}
-
- // If we're inserting a bitcast into zeros, rewrite the insert and move the
- // bitcast to the other side. This helps with detecting zero extending
- // during isel.
- // TODO: Is this useful for other indices than 0?
- if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
- MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
- unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
- MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
- SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
- DAG.getBitcast(NewVT, Vec),
- SubVec.getOperand(0), N->getOperand(2));
- return DAG.getBitcast(OpVT, Insert);
- }
}
// Stop here if this is an i1 vector.
@@ -41248,77 +43823,92 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
}
}
- // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
- // load:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr + 16), Elts/2)
- // --> load32 addr
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr + 32), Elts/2)
- // --> load64 addr
- // or a 16-byte or 32-byte broadcast:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr), Elts/2)
- // --> X86SubVBroadcast(load16 addr)
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr), Elts/2)
- // --> X86SubVBroadcast(load32 addr)
+ // Match concat_vector style patterns.
+ SmallVector<SDValue, 2> SubVectorOps;
+ if (collectConcatOps(N, SubVectorOps))
+ if (SDValue Fold =
+ combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
+ return Fold;
+
+ // If we are inserting into both halves of the vector, the starting vector
+ // should be undef. If it isn't, make it so. Only do this if the early insert
+ // has no other uses.
+ // TODO: Should this be a generic DAG combine?
+ // TODO: Why doesn't SimplifyDemandedVectorElts catch this?
if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
- if (isNullConstant(Vec.getOperand(2))) {
- SDValue SubVec2 = Vec.getOperand(1);
- // If needed, look through bitcasts to get to the load.
- if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
- bool Fast;
- unsigned Alignment = FirstLd->getAlignment();
- unsigned AS = FirstLd->getAddressSpace();
- const X86TargetLowering *TLI = Subtarget.getTargetLowering();
- if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
- OpVT, AS, Alignment, &Fast) && Fast) {
- SDValue Ops[] = {SubVec2, SubVec};
- if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
- Subtarget, false))
- return Ld;
- }
- }
- // If lower/upper loads are the same and there's no other use of the lower
- // load, then splat the loaded value with a broadcast.
- if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
- if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && Vec.hasOneUse())
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
-
- // If this is subv_broadcast insert into both halves, use a larger
- // subv_broadcast.
- if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
- SubVec.getOperand(0));
-
- // If we're inserting all zeros into the upper half, change this to
- // an insert into an all zeros vector. We will match this to a move
- // with implicit upper bit zeroing during isel.
- if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
- getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
- Vec.getOperand(2));
+ OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 &&
+ isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() &&
+ Vec.hasOneUse()) {
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
+ Vec.getOperand(1), Vec.getOperand(2));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
+ N->getOperand(2));
+ }
- // If we are inserting into both halves of the vector, the starting
- // vector should be undef. If it isn't, make it so. Only do this if the
- // the early insert has no other uses.
- // TODO: Should this be a generic DAG combine?
- if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
- SubVec2, Vec.getOperand(2));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
- N->getOperand(2));
+ // If this is a broadcast insert into an upper undef, use a larger broadcast.
+ if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
+ return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
- }
- }
+ return SDValue();
+}
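The both-halves canonicalization above, in the notation of the surrounding comments (N = element count, the inner insert has one use, C is not undef):

    (insert_subvector (insert_subvector C, X, 0), Y, N/2)
      --> (insert_subvector (insert_subvector undef, X, 0), Y, N/2)

so the non-undef base C is dropped once both halves are overwritten.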
+
+/// If we are extracting a subvector of a vector select and the select condition
+/// is composed of concatenated vectors, try to narrow the select width. This
+/// is a common pattern for AVX1 integer code because 256-bit selects may be
+/// legal, but there is almost no integer math/logic available for 256-bit.
+/// This function should only be called with legal types (otherwise, the calls
+/// to get simple value types will assert).
+static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
+ SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
+ SmallVector<SDValue, 4> CatOps;
+ if (Sel.getOpcode() != ISD::VSELECT ||
+ !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
+ return SDValue();
+
+ // Note: We assume simple value types because this should only be called with
+ // legal operations/types.
+ // TODO: This can be extended to handle extraction to 256-bits.
+ MVT VT = Ext->getSimpleValueType(0);
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
+ if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
+ return SDValue();
+
+ MVT WideVT = Ext->getOperand(0).getSimpleValueType();
+ MVT SelVT = Sel.getSimpleValueType();
+ assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
+ "Unexpected vector type with legal operations");
+
+ unsigned SelElts = SelVT.getVectorNumElements();
+ unsigned CastedElts = WideVT.getVectorNumElements();
+ unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
+ if (SelElts % CastedElts == 0) {
+ // The select has the same or more (narrower) elements than the extract
+ // operand. The extraction index gets scaled by that factor.
+ ExtIdx *= (SelElts / CastedElts);
+ } else if (CastedElts % SelElts == 0) {
+    // The select has fewer (wider) elements than the extract operand. Make sure
+ // that the extraction index can be divided evenly.
+ unsigned IndexDivisor = CastedElts / SelElts;
+ if (ExtIdx % IndexDivisor != 0)
+ return SDValue();
+ ExtIdx /= IndexDivisor;
+ } else {
+ llvm_unreachable("Element count of simple vector types are not divisible?");
}
- return SDValue();
+ unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
+ unsigned NarrowElts = SelElts / NarrowingFactor;
+ MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
+ SDLoc DL(Ext);
+ SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
+ SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
+ SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
+ SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
+ return DAG.getBitcast(VT, NarrowSel);
}
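A worked instance of the index scaling above (types illustrative): extracting the upper v2i64 half (index 2) of a v4i64 bitcast of a v8i32 vselect gives SelElts = 8 and CastedElts = 4, so the index is scaled to 4; the narrowed node is a v4i32 vselect over the upper 128-bit halves of the condition and both operands, bitcast back to v2i64.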
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
@@ -41334,7 +43924,10 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
// Capture the original wide type in the likely case that we need to bitcast
// back to this type.
- EVT VT = N->getValueType(0);
+ if (!N->getValueType(0).isSimple())
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
EVT WideVecVT = N->getOperand(0).getValueType();
SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -41360,65 +43953,102 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
- MVT OpVT = N->getSimpleValueType(0);
+ if (SDValue V = narrowExtractedVectorSelect(N, DAG))
+ return V;
+
SDValue InVec = N->getOperand(0);
unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
- return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
+ return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
- if (OpVT.getScalarType() == MVT::i1)
- return DAG.getConstant(1, SDLoc(N), OpVT);
- return getOnesVector(OpVT, DAG, SDLoc(N));
+ if (VT.getScalarType() == MVT::i1)
+ return DAG.getConstant(1, SDLoc(N), VT);
+ return getOnesVector(VT, DAG, SDLoc(N));
}
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(
- OpVT, SDLoc(N),
- InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
+ VT, SDLoc(N),
+ InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
+
+ // Try to move vector bitcast after extract_subv by scaling extraction index:
+ // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
+ // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
+ if (InVec.getOpcode() == ISD::BITCAST &&
+ InVec.getOperand(0).getValueType().isVector()) {
+ SDValue SrcOp = InVec.getOperand(0);
+ EVT SrcVT = SrcOp.getValueType();
+ unsigned SrcNumElts = SrcVT.getVectorNumElements();
+ unsigned DestNumElts = InVec.getValueType().getVectorNumElements();
+ if ((DestNumElts % SrcNumElts) == 0) {
+ unsigned DestSrcRatio = DestNumElts / SrcNumElts;
+ if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
+ unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
+ EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
+ SrcVT.getScalarType(), NewExtNumElts);
+ if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+ unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
+ SDLoc DL(N);
+ SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
+ SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+ SrcOp, NewIndex);
+ return DAG.getBitcast(VT, NewExtract);
+ }
+ }
+ }
+ }
+
+ // If we're extracting from a broadcast then we're better off just
+ // broadcasting to the smaller type directly, assuming this is the only use.
+  // As it's a broadcast we don't care about the extraction index.
+ if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
+ InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
+ return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
unsigned InOpcode = InVec.getOpcode();
- if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
+ if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
- return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0));
+ return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
+ }
+ // v2f64 CVTUDQ2PD(v4i32).
+ if (InOpcode == ISD::UINT_TO_FP &&
+ InVec.getOperand(0).getValueType() == MVT::v4i32) {
+ return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTPS2PD(v4f32).
if (InOpcode == ISD::FP_EXTEND &&
InVec.getOperand(0).getValueType() == MVT::v4f32) {
- return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
+ return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
}
}
- if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) &&
- OpVT.is128BitVector() &&
- InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
- unsigned ExtOp =
- InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG
- : ISD::SIGN_EXTEND_VECTOR_INREG;
- return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
- }
- if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ if ((InOpcode == ISD::ANY_EXTEND ||
+ InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
+ InOpcode == ISD::ZERO_EXTEND ||
+ InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ InOpcode == ISD::SIGN_EXTEND ||
InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
- OpVT.is128BitVector() &&
+ VT.is128BitVector() &&
InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
- return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0));
+ unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
+ return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
}
- if (InOpcode == ISD::BITCAST) {
- // TODO - do this for target shuffles in general.
- SDValue InVecBC = peekThroughOneUseBitcasts(InVec);
- if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) {
- SDLoc DL(N);
- SDValue SubPSHUFB =
- DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
- extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL),
- extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL));
- return DAG.getBitcast(OpVT, SubPSHUFB);
- }
+ if (InOpcode == ISD::VSELECT &&
+ InVec.getOperand(0).getValueType().is256BitVector() &&
+ InVec.getOperand(1).getValueType().is256BitVector() &&
+ InVec.getOperand(2).getValueType().is256BitVector()) {
+ SDLoc DL(N);
+ SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
+ SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
+ SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
+ return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
}
@@ -41428,6 +44058,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
+ SDLoc DL(N);
// If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
// This occurs frequently in our masked scalar intrinsic code and our
@@ -41436,7 +44067,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->getAPIntValue().isOneValue())
- return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
Src.getOperand(0));
// Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
@@ -41445,8 +44076,17 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->isNullValue())
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
- Src.getOperand(0), Src.getOperand(1));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
+ Src.getOperand(1));
+
+ // Reduce v2i64 to v4i32 if we don't need the upper bits.
+ // TODO: Move to DAGCombine?
+ if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
+ Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
+ Src.getOperand(0).getScalarValueSizeInBits() <= 32)
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
+ DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
return SDValue();
}
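The final fold above, in pattern form (illustrative):

    (v2i64 scalar_to_vector (i64 any_extend x:i32))
      --> (bitcast (v4i32 scalar_to_vector x))

avoiding a 64-bit scalar when only the low 32 bits are meaningful.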
@@ -41483,6 +44123,56 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue In = N->getOperand(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Try to merge vector loads and extend_inreg to an extload.
+ if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
+ In.hasOneUse()) {
+ auto *Ld = cast<LoadSDNode>(In);
+ if (!Ld->isVolatile()) {
+ MVT SVT = In.getSimpleValueType().getVectorElementType();
+      ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
+                                 ? ISD::SEXTLOAD
+                                 : ISD::ZEXTLOAD;
+ EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
+ VT.getVectorNumElements());
+ if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
+ SDValue Load =
+ DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+ return Load;
+ }
+ }
+ }
+
+  // Disabled under widening legalization for now. We can enable this if we
+  // find a case that needs it. Otherwise it can be deleted when we switch to
+  // widening legalization.
+ if (ExperimentalVectorWideningLegalization)
+ return SDValue();
+
+ // Combine (ext_invec (ext_invec X)) -> (ext_invec X)
+ if (In.getOpcode() == N->getOpcode() &&
+ TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
+
+ // Attempt to combine as a shuffle.
+ // TODO: SSE41 support
+ if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
+ SDValue Op(N, 0);
+ if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ return SDValue();
+}
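The load fold at the top of combineExtInVec is, schematically (types illustrative):

    (v8i16 zero_extend_vector_inreg (v16i8 load addr))
      --> (v8i16 zextload addr, memVT = v8i8)

i.e. the in-register extend is merged into an extending load when that extload is legal and the load has no other uses.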
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -41494,6 +44184,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+ case ISD::CONCAT_VECTORS:
+ return combineConcatVectors(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::EXTRACT_SUBVECTOR:
@@ -41506,19 +44198,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::CMP: return combineCMP(N, DAG);
case ISD::ADD: return combineAdd(N, DAG, Subtarget);
case ISD::SUB: return combineSub(N, DAG, Subtarget);
+ case X86ISD::ADD:
+ case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
- case ISD::SHL:
- case ISD::SRA:
- case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
+ case ISD::SHL: return combineShiftLeft(N, DAG);
+ case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
+ case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
- case ISD::STORE: return combineStore(N, DAG, Subtarget);
+ case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
@@ -41535,13 +44229,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
- case X86ISD::CVTSI2P:
+ case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
+ case X86ISD::CVTP2SI:
+ case X86ISD::CVTP2UI:
+ case X86ISD::CVTTP2SI:
+ case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
+ Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
@@ -41624,11 +44326,15 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
return false;
- // 8-bit multiply is probably not much cheaper than 32-bit multiply, and
- // we have specializations to turn 32-bit multiply into LEA or other ops.
+ // TODO: Almost no 8-bit ops are desirable because they have no actual
+ // size/speed advantages vs. 32-bit ops, but they do have a major
+ // potential disadvantage by causing partial register stalls.
+ //
+ // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
+ // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
// Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
// check for a constant operand to the multiply.
- if (Opc == ISD::MUL && VT == MVT::i8)
+ if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
return false;
// i16 instruction encodings are longer and some i16 instructions are slow,
@@ -41642,6 +44348,7 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::SHL:
+ case ISD::SRA:
case ISD::SRL:
case ISD::SUB:
case ISD::ADD:
@@ -41717,6 +44424,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
case ISD::ANY_EXTEND:
break;
case ISD::SHL:
+ case ISD::SRA:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
@@ -41889,6 +44597,40 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
return false;
}
+static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
+ X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
+ .Case("{@cca}", X86::COND_A)
+ .Case("{@ccae}", X86::COND_AE)
+ .Case("{@ccb}", X86::COND_B)
+ .Case("{@ccbe}", X86::COND_BE)
+ .Case("{@ccc}", X86::COND_B)
+ .Case("{@cce}", X86::COND_E)
+ .Case("{@ccz}", X86::COND_E)
+ .Case("{@ccg}", X86::COND_G)
+ .Case("{@ccge}", X86::COND_GE)
+ .Case("{@ccl}", X86::COND_L)
+ .Case("{@ccle}", X86::COND_LE)
+ .Case("{@ccna}", X86::COND_BE)
+ .Case("{@ccnae}", X86::COND_B)
+ .Case("{@ccnb}", X86::COND_AE)
+ .Case("{@ccnbe}", X86::COND_A)
+ .Case("{@ccnc}", X86::COND_AE)
+ .Case("{@ccne}", X86::COND_NE)
+ .Case("{@ccnz}", X86::COND_NE)
+ .Case("{@ccng}", X86::COND_LE)
+ .Case("{@ccnge}", X86::COND_L)
+ .Case("{@ccnl}", X86::COND_GE)
+ .Case("{@ccnle}", X86::COND_G)
+ .Case("{@ccno}", X86::COND_NO)
+ .Case("{@ccnp}", X86::COND_P)
+ .Case("{@ccns}", X86::COND_NS)
+ .Case("{@cco}", X86::COND_O)
+ .Case("{@ccp}", X86::COND_P)
+ .Case("{@ccs}", X86::COND_S)
+ .Default(X86::COND_INVALID);
+ return Cond;
+}
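These names correspond to GCC-style flag output constraints in C/C++ inline asm; the braces are how the constraint reaches the backend. A minimal usage sketch (variable names are illustrative, not from the patch):

    unsigned long a = 1, b = ~0UL, sum;
    int carry;
    asm("add %2, %0" : "=r"(sum), "=@ccc"(carry) : "r"(a), "0"(b));
    // carry is 1 here: the add sets CF, which "{@ccc}" maps to COND_B above.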
+
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
@@ -41949,7 +44691,8 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
return C_RegisterClass;
}
}
- }
+ } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
+ return C_Other;
return TargetLowering::getConstraintType(Constraint);
}
@@ -42120,6 +44863,32 @@ LowerXConstraint(EVT ConstraintVT) const {
return TargetLowering::LowerXConstraint(ConstraintVT);
}
+// Lower @cc targets via setcc.
+SDValue X86TargetLowering::LowerAsmOutputForConstraint(
+ SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
+ SelectionDAG &DAG) const {
+ X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
+ if (Cond == X86::COND_INVALID)
+ return SDValue();
+ // Check that return type is valid.
+ if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
+ OpInfo.ConstraintVT.getSizeInBits() < 8)
+ report_fatal_error("Flag output operand is of invalid type");
+
+ // Get EFLAGS register. Only update chain when copyfrom is glued.
+ if (Flag.getNode()) {
+ Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
+ Chain = Flag.getValue(1);
+ } else
+ Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
+ // Extract CC code.
+ SDValue CC = getSETCC(Cond, Flag, DL, DAG);
+ // Extend to 32-bits
+ SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
+
+ return Result;
+}
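For an "=@ccz" output of type int, the lowering above produces roughly (a sketch, not verbatim DAG output):

    (zero_extend:i32 (X86ISD::SETCC COND_E, (CopyFromReg EFLAGS)))

with the CopyFromReg glued to the asm node when a glue value is supplied, so the flags are read before anything else can clobber them.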
+
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
@@ -42229,8 +44998,13 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'i': {
// Literal immediates are always ok.
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
- // Widen to 64 bits here to get it sign extended.
- Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
+ bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
+ BooleanContent BCont = getBooleanContents(MVT::i64);
+ ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
+ : ISD::SIGN_EXTEND;
+ int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
+ : CST->getSExtValue();
+ Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
break;
}
@@ -42242,40 +45016,12 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
- GlobalAddressSDNode *GA = nullptr;
- int64_t Offset = 0;
-
- // Match either (GA), (GA+C), (GA+C1+C2), etc.
- while (1) {
- if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
- Offset += GA->getOffset();
- break;
- } else if (Op.getOpcode() == ISD::ADD) {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- Offset += C->getZExtValue();
- Op = Op.getOperand(0);
- continue;
- }
- } else if (Op.getOpcode() == ISD::SUB) {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- Offset += -C->getZExtValue();
- Op = Op.getOperand(0);
- continue;
- }
- }
-
- // Otherwise, this isn't something we can handle, reject it.
- return;
- }
-
- const GlobalValue *GV = GA->getGlobal();
- // If we require an extra load to get this address, as in PIC mode, we
- // can't accept it.
- if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
- return;
-
- Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
- GA->getValueType(0), Offset);
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
+ // If we require an extra load to get this address, as in PIC mode, we
+ // can't accept it.
+ if (isGlobalStubReference(
+ Subtarget.classifyGlobalReference(GA->getGlobal())))
+ return;
break;
}
}
@@ -42307,6 +45053,18 @@ static bool isFRClass(const TargetRegisterClass &RC) {
RC.hasSuperClassEq(&X86::VR512RegClass);
}
+/// Check if \p RC is a mask register class.
+/// I.e., VK* or one of their variant.
+static bool isVKClass(const TargetRegisterClass &RC) {
+ return RC.hasSuperClassEq(&X86::VK1RegClass) ||
+ RC.hasSuperClassEq(&X86::VK2RegClass) ||
+ RC.hasSuperClassEq(&X86::VK4RegClass) ||
+ RC.hasSuperClassEq(&X86::VK8RegClass) ||
+ RC.hasSuperClassEq(&X86::VK16RegClass) ||
+ RC.hasSuperClassEq(&X86::VK32RegClass) ||
+ RC.hasSuperClassEq(&X86::VK64RegClass);
+}
+
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
@@ -42317,25 +45075,31 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// GCC Constraint Letters
switch (Constraint[0]) {
default: break;
+ // 'A' means [ER]AX + [ER]DX.
+ case 'A':
+ if (Subtarget.is64Bit())
+ return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
+ assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
+ "Expecting 64, 32 or 16 bit subtarget");
+ return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
+
// TODO: Slight differences here in allocation order and leaving
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
case 'k':
if (Subtarget.hasAVX512()) {
- // Only supported in AVX512 or later.
- switch (VT.SimpleTy) {
- default: break;
- case MVT::i32:
- return std::make_pair(0U, &X86::VK32RegClass);
- case MVT::i16:
- return std::make_pair(0U, &X86::VK16RegClass);
- case MVT::i8:
- return std::make_pair(0U, &X86::VK8RegClass);
- case MVT::i1:
+ if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1RegClass);
- case MVT::i64:
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &X86::VK8RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::VK16RegClass);
+ }
+ if (Subtarget.hasBWI()) {
+ if (VT == MVT::i32)
+ return std::make_pair(0U, &X86::VK32RegClass);
+ if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64RegClass);
- }
}
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
@@ -42403,7 +45167,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
- if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
+ if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR32XRegClass);
return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
@@ -42431,12 +45195,17 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case MVT::v4f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR256XRegClass);
- return std::make_pair(0U, &X86::VR256RegClass);
+ if (Subtarget.hasAVX())
+ return std::make_pair(0U, &X86::VR256RegClass);
+ break;
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
- return std::make_pair(0U, &X86::VR512RegClass);
+ if (!Subtarget.hasAVX512()) break;
+ if (VConstraint)
+ return std::make_pair(0U, &X86::VR512RegClass);
+ return std::make_pair(0U, &X86::VR512_0_15RegClass);
}
break;
}
@@ -42457,25 +45226,27 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(X86::XMM0, &X86::VR128RegClass);
case 'k':
// This register class doesn't allocate k0 for masked vector operation.
- if (Subtarget.hasAVX512()) { // Only supported in AVX512.
- switch (VT.SimpleTy) {
- default: break;
- case MVT::i32:
- return std::make_pair(0U, &X86::VK32WMRegClass);
- case MVT::i16:
- return std::make_pair(0U, &X86::VK16WMRegClass);
- case MVT::i8:
- return std::make_pair(0U, &X86::VK8WMRegClass);
- case MVT::i1:
+ if (Subtarget.hasAVX512()) {
+ if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1WMRegClass);
- case MVT::i64:
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &X86::VK8WMRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::VK16WMRegClass);
+ }
+ if (Subtarget.hasBWI()) {
+ if (VT == MVT::i32)
+ return std::make_pair(0U, &X86::VK32WMRegClass);
+ if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64WMRegClass);
- }
}
break;
}
}
+ if (parseConstraintCode(Constraint) != X86::COND_INVALID)
+ return std::make_pair(0U, &X86::GR32RegClass);
+
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
@@ -42505,14 +45276,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (StringRef("{flags}").equals_lower(Constraint))
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
- // 'A' means [ER]AX + [ER]DX.
- if (Constraint == "A") {
- if (Subtarget.is64Bit())
- return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
- assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
- "Expecting 64, 32 or 16 bit subtarget");
- return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
- }
+ // dirflag -> DF
+ if (StringRef("{dirflag}").equals_lower(Constraint))
+ return std::make_pair(X86::DF, &X86::DFCCRRegClass);
+
+ // fpsr -> FPSW
+ if (StringRef("{fpsr}").equals_lower(Constraint))
+ return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
+
return Res;
}
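For context: Clang appends "~{dirflag},~{fpsr},~{flags}" to the clobber list of every x86 asm statement compiled from C/C++ (even a bare asm volatile("cld");), so these names must resolve to DF, FPSW and EFLAGS here rather than falling through as unknown registers.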
@@ -42561,20 +45332,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Size == 64 && !is64Bit) {
// Model GCC's behavior here and select a fixed pair of 32-bit
// registers.
- switch (Res.first) {
- case X86::EAX:
+ switch (DestReg) {
+ case X86::RAX:
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
- case X86::EDX:
+ case X86::RDX:
return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
- case X86::ECX:
+ case X86::RCX:
return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
- case X86::EBX:
+ case X86::RBX:
return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
- case X86::ESI:
+ case X86::RSI:
return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
- case X86::EDI:
+ case X86::RDI:
return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
- case X86::EBP:
+ case X86::RBP:
return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
default:
return std::make_pair(0, nullptr);
@@ -42594,13 +45365,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
- Res.second = &X86::FR32RegClass;
+ Res.second = &X86::FR32XRegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
- Res.second = &X86::FR64RegClass;
- else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
- Res.second = &X86::VR128RegClass;
- else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
- Res.second = &X86::VR256RegClass;
+ Res.second = &X86::FR64XRegClass;
+ else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
+ Res.second = &X86::VR128XRegClass;
+ else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
+ Res.second = &X86::VR256XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
@@ -42608,6 +45379,22 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Res.first = 0;
Res.second = nullptr;
}
+ } else if (isVKClass(*Class)) {
+ if (VT == MVT::i1)
+ Res.second = &X86::VK1RegClass;
+ else if (VT == MVT::i8)
+ Res.second = &X86::VK8RegClass;
+ else if (VT == MVT::i16)
+ Res.second = &X86::VK16RegClass;
+ else if (VT == MVT::i32)
+ Res.second = &X86::VK32RegClass;
+ else if (VT == MVT::i64)
+ Res.second = &X86::VK64RegClass;
+ else {
+      // Type mismatch and not a clobber: return an error.
+ Res.first = 0;
+ Res.second = nullptr;
+ }
}
return Res;
@@ -42660,7 +45447,7 @@ void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
// Update IsSplitCSR in X86MachineFunctionInfo.
X86MachineFunctionInfo *AFI =
- Entry->getParent()->getInfo<X86MachineFunctionInfo>();
+ Entry->getParent()->getInfo<X86MachineFunctionInfo>();
AFI->setIsSplitCSR(true);
}
@@ -42688,9 +45475,9 @@ void X86TargetLowering::insertCopiesSplitCSR(
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction().hasFnAttribute(
- Attribute::NoUnwind) &&
- "Function should be nounwind in insertCopiesSplitCSR!");
+ assert(
+ Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
@@ -42709,7 +45496,8 @@ bool X86TargetLowering::supportSwiftError() const {
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
-StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+StringRef
+X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 910acd80e8b8..e0be03bc3f9d 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -1,9 +1,8 @@
//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -78,15 +77,6 @@ namespace llvm {
/// Same as call except it adds the NoTrack prefix.
NT_CALL,
- /// This operation implements the lowering for readcyclecounter.
- RDTSC_DAG,
-
- /// X86 Read Time-Stamp Counter and Processor ID.
- RDTSCP_DAG,
-
- /// X86 Read Performance Monitoring Counters.
- RDPMC_DAG,
-
/// X86 compare and logical compare instructions.
CMP, COMI, UCOMI,
@@ -110,13 +100,12 @@ namespace llvm {
FSETCC,
/// X86 FP SETCC, similar to above, but with output as an i1 mask and
- /// with optional rounding mode.
- FSETCCM, FSETCCM_RND,
+  /// a version with SAE.
+ FSETCCM, FSETCCM_SAE,
/// X86 conditional moves. Operand 0 and operand 1 are the two values
/// to select from. Operand 2 is the condition code, and operand 3 is the
- /// flag operand produced by a CMP or TEST instruction. It also writes a
- /// flag result.
+ /// flag operand produced by a CMP or TEST instruction.
CMOV,
/// X86 conditional branches. Operand 0 is the chain operand, operand 1
@@ -204,28 +193,29 @@ namespace llvm {
/// Dynamic (non-constant condition) vector blend where only the sign bits
/// of the condition elements are used. This is used to enforce that the
/// condition mask is not valid for generic VSELECT optimizations. This
- /// can also be used to implement the intrinsics.
+ /// is also used to implement the intrinsics.
+ /// Operands are in VSELECT order: MASK, TRUE, FALSE
BLENDV,
/// Combined add and sub on an FP vector.
ADDSUB,
// FP vector ops with rounding mode.
- FADD_RND, FADDS_RND,
- FSUB_RND, FSUBS_RND,
- FMUL_RND, FMULS_RND,
- FDIV_RND, FDIVS_RND,
- FMAX_RND, FMAXS_RND,
- FMIN_RND, FMINS_RND,
- FSQRT_RND, FSQRTS_RND,
+ FADD_RND, FADDS, FADDS_RND,
+ FSUB_RND, FSUBS, FSUBS_RND,
+ FMUL_RND, FMULS, FMULS_RND,
+ FDIV_RND, FDIVS, FDIVS_RND,
+ FMAX_SAE, FMAXS_SAE,
+ FMIN_SAE, FMINS_SAE,
+ FSQRT_RND, FSQRTS, FSQRTS_RND,
// FP vector get exponent.
- FGETEXP_RND, FGETEXPS_RND,
+ FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE,
// Extract Normalized Mantissas.
- VGETMANT, VGETMANT_RND, VGETMANTS, VGETMANTS_RND,
+ VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE,
// FP Scale.
- SCALEF,
- SCALEFS,
+ SCALEF, SCALEF_RND,
+ SCALEFS, SCALEFS_RND,
// Unsigned Integer average.
AVG,
@@ -300,10 +290,10 @@ namespace llvm {
VMTRUNC, VMTRUNCUS, VMTRUNCS,
// Vector FP extend.
- VFPEXT, VFPEXT_RND, VFPEXTS_RND,
+ VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE,
// Vector FP round.
- VFPROUND, VFPROUND_RND, VFPROUNDS_RND,
+ VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND,
// Masked version of above. Used for v2f64->v4f32.
// SRC, PASSTHRU, MASK
@@ -315,10 +305,8 @@ namespace llvm {
// Vector shift elements
VSHL, VSRL, VSRA,
- // Vector variable shift right arithmetic.
- // Unlike ISD::SRA, in case shift count greater then element size
- // use sign bit to fill destination data element.
- VSRAV,
+ // Vector variable shift
+ VSHLV, VSRLV, VSRAV,
// Vector shift elements by immediate
VSHLI, VSRLI, VSRAI,
@@ -343,8 +331,8 @@ namespace llvm {
/// Vector comparison generating mask bits for fp and
/// integer signed and unsigned data types.
CMPM,
- // Vector comparison with rounding mode for FP values
- CMPM_RND,
+ // Vector comparison with SAE for FP values
+ CMPM_SAE,
// Arithmetic operations with FLAGS results.
ADD, SUB, ADC, SBB, SMUL, UMUL,
@@ -419,16 +407,16 @@ namespace llvm {
// Bitwise ternary logic.
VPTERNLOG,
// Fix Up Special Packed Float32/64 values.
- VFIXUPIMM,
- VFIXUPIMMS,
+ VFIXUPIMM, VFIXUPIMM_SAE,
+ VFIXUPIMMS, VFIXUPIMMS_SAE,
// Range Restriction Calculation For Packed Pairs of Float32/64 values.
- VRANGE, VRANGE_RND, VRANGES, VRANGES_RND,
+ VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE,
// Reduce - Perform Reduction Transformation on scalar\packed FP.
- VREDUCE, VREDUCE_RND, VREDUCES, VREDUCES_RND,
+ VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE,
// RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
// Also used by the legacy (V)ROUND intrinsics where we mask out the
// scaling part of the immediate.
- VRNDSCALE, VRNDSCALE_RND, VRNDSCALES, VRNDSCALES_RND,
+ VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE,
// Tests Types Of a FP Values for packed types.
VFPCLASS,
// Tests Types Of a FP Values for scalar types.
@@ -499,6 +487,7 @@ namespace llvm {
// Convert Unsigned/Integer to Floating-Point Value with rounding mode.
SINT_TO_FP_RND, UINT_TO_FP_RND,
+ SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP,
SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
// Vector float/double to signed/unsigned integer.
@@ -507,9 +496,9 @@ namespace llvm {
CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND,
// Vector float/double to signed/unsigned integer with truncation.
- CVTTP2SI, CVTTP2UI, CVTTP2SI_RND, CVTTP2UI_RND,
+ CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE,
// Scalar float/double to signed/unsigned integer with truncation.
- CVTTS2SI, CVTTS2UI, CVTTS2SI_RND, CVTTS2UI_RND,
+ CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE,
// Vector signed/unsigned integer to float/double.
CVTSI2P, CVTUI2P,
@@ -517,6 +506,20 @@ namespace llvm {
// Masked versions of above. Used for v2f64->v4f32.
// SRC, PASSTHRU, MASK
MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI,
+ MCVTSI2P, MCVTUI2P,
+
+ // Vector float to bfloat16.
+ // Convert TWO packed single data to one packed BF16 data
+ CVTNE2PS2BF16,
+ // Convert packed single data to packed BF16 data
+ CVTNEPS2BF16,
+ // Masked version of above.
+ // SRC, PASSTHRU, MASK
+ MCVTNEPS2BF16,
+
+  // Dot product of BF16 pairs accumulated into
+ // packed single precision.
+ DPBF16PS,
// Save xmm argument registers to the stack, according to %al. An operator
// is needed so that this can be expanded with control flow.
@@ -547,6 +550,12 @@ namespace llvm {
// indicate whether it is valid in CF.
RDSEED,
+ // Protection keys
+ // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
+ // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
+ // value for ECX.
+ RDPKRU, WRPKRU,
+
// SSE42 string comparisons.
// These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
// will emit one or two instructions based on which results are used. If
@@ -560,10 +569,11 @@ namespace llvm {
XTEST,
// ERI instructions.
- RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
+ RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE,
+ RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE,
// Conversions between float and half-float.
- CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,
+ CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE,
// Masked version of above.
// SRC, RND, PASSTHRU, MASK
@@ -578,6 +588,12 @@ namespace llvm {
// User level wait
UMWAIT, TPAUSE,
+ // Enqueue Stores Instructions
+ ENQCMD, ENQCMDS,
+
+ // For avx512-vp2intersect
+ VP2INTERSECT,
+
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
@@ -592,6 +608,9 @@ namespace llvm {
// Load, scalar_to_vector, and zero extend.
VZEXT_LOAD,
+ // extract_vector_elt, store.
+ VEXTRACT_STORE,
+
// Store FP control world into i16 memory.
FNSTCW16m,
@@ -599,29 +618,33 @@ namespace llvm {
/// integer destination in memory and a FP reg source. This corresponds
/// to the X86::FIST*m instructions and the rounding mode change stuff. It
/// has two inputs (token chain and address) and two outputs (int value
- /// and token chain).
- FP_TO_INT16_IN_MEM,
- FP_TO_INT32_IN_MEM,
- FP_TO_INT64_IN_MEM,
+ /// and token chain). Memory VT specifies the type to store to.
+ FP_TO_INT_IN_MEM,
/// This instruction implements SINT_TO_FP with the
/// integer source in memory and FP reg result. This corresponds to the
- /// X86::FILD*m instructions. It has three inputs (token chain, address,
- /// and source type) and two outputs (FP value and token chain). FILD_FLAG
- /// also produces a flag).
+ /// X86::FILD*m instructions. It has two inputs (token chain and address)
+ /// and two outputs (FP value and token chain). FILD_FLAG also produces a
+  /// flag. The integer source type is specified by the memory VT.
FILD,
FILD_FLAG,
+ /// This instruction implements a fp->int store from FP stack
+ /// slots. This corresponds to the fist instruction. It takes a
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
+ FIST,
+
/// This instruction implements an extending load to FP stack slots.
/// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
- /// operand, ptr to load from, and a ValueType node indicating the type
- /// to load to.
+ /// operand, and ptr to load from. The memory VT specifies the type to
+ /// load from.
FLD,
- /// This instruction implements a truncating store to FP stack
+ /// This instruction implements a truncating store from FP stack
/// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
- /// chain operand, value to store, address, and a ValueType to store it
- /// as.
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
FST,
/// This instruction grabs the address of the next argument
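The comment block above is the heart of this hunk: the rewritten FP_TO_INT_IN_MEM, FILD, FIST, FLD and FST nodes now carry the in-memory operand width in the node's memory VT instead of a separate ValueType operand (or, for FP_TO_INT_IN_MEM, one opcode per width). A minimal sketch of how selection code can read that width back, assuming these nodes are built as memory intrinsic nodes and are therefore MemSDNodes; the helper name is illustrative, not from this patch:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical helper: report the in-memory type of an FP-stack load/store
// node such as FIST, FLD, FST, FILD or FP_TO_INT_IN_MEM.
static EVT getFpStackMemType(const SDNode *N) {
  // Assumption: the node was created as a memory intrinsic node, so the
  // stored/loaded type is recorded as its memory VT (e.g. MVT::i16/i32/i64).
  return cast<MemSDNode>(N)->getMemoryVT();
}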
@@ -708,7 +731,7 @@ namespace llvm {
/// target-independent logic.
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- MachineFunction &MF) const override;
+ const AttributeList &FuncAttributes) const override;
/// Returns true if it's safe to use load / store of the
/// specified type to expand memcpy / memset inline. This is mostly true
@@ -721,7 +744,8 @@ namespace llvm {
/// Returns true if the target allows unaligned memory accesses of the
/// specified type. Returns whether it is "fast" in the last argument.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
- bool *Fast) const override;
+ MachineMemOperand::Flags Flags,
+ bool *Fast) const override;
/// Provide custom lowering hooks for some operations.
///
@@ -775,7 +799,11 @@ namespace llvm {
/// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
- bool mergeStoresAfterLegalization() const override { return true; }
+ /// Do not merge vector stores after legalization because that may conflict
+ /// with x86-specific store splitting optimizations.
+ bool mergeStoresAfterLegalization(EVT MemVT) const override {
+ return !MemVT.isVector();
+ }
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const SelectionDAG &DAG) const override;
@@ -812,7 +840,10 @@ namespace llvm {
bool hasAndNot(SDValue Y) const override;
- bool preferShiftsToClearExtremeBits(SDValue Y) const override;
+ bool shouldFoldConstantShiftPairToMask(const SDNode *N,
+ CombineLevel Level) const override;
+
+ bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
bool
shouldTransformSignedTruncationCheck(EVT XVT,
@@ -832,6 +863,12 @@ namespace llvm {
return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
}
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return false;
+ return true;
+ }
+
bool shouldSplatInsEltVarIndex(EVT VT) const override;
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
@@ -841,11 +878,6 @@ namespace llvm {
/// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
MVT hasFastEqualityCompare(unsigned NumBits) const override;
- /// Allow multiple load pairs per block for smaller and faster code.
- unsigned getMemcmpEqZeroLoadsPerBlock() const override {
- return 2;
- }
-
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -881,6 +913,8 @@ namespace llvm {
TargetLoweringOpt &TLO,
unsigned Depth) const override;
+ const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
+
SDValue unwrapAddress(SDValue N) const override;
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
@@ -918,6 +952,11 @@ namespace llvm {
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
+ /// Handle Lowering flag assembly outputs.
+ SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, SDLoc DL,
+ const AsmOperandInfo &Constraint,
+ SelectionDAG &DAG) const override;
+
/// Given a physical register constraint
/// (e.g. {edx}), return the register number and the register class for the
/// register. This should only be used for C_Register constraints. On
@@ -956,6 +995,12 @@ namespace llvm {
bool isVectorShiftByScalarCheap(Type *Ty) const override;
+ /// Add x86-specific opcodes to the default list.
+ bool isBinOp(unsigned Opcode) const override;
+
+ /// Returns true if the opcode is a commutative binary operation.
+ bool isCommutativeBinOp(unsigned Opcode) const override;
+
/// Return true if it's free to truncate a value of
/// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
/// register EAX to i16 by referencing its sub-register AX.
@@ -1001,7 +1046,8 @@ namespace llvm {
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
@@ -1063,6 +1109,17 @@ namespace llvm {
/// supported.
bool shouldScalarizeBinop(SDValue) const override;
+ /// Extract of a scalar FP value from index 0 of a vector is free.
+ bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
+ EVT EltVT = VT.getScalarType();
+ return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
+ }
+
+ /// Overflow nodes should get combined/lowered to optimal instructions
+ /// (they should allow eliminating explicit compares by getting flags from
+ /// math ops).
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override;
+
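The comment on shouldFormOverflowOp above states the motivation: on x86 the ADD/SUB already sets the flags, so lowering to an overflow node lets the explicit compare disappear. A small, self-contained C++ illustration of the source-level pattern that benefits, using the Clang/GCC builtin (this example is not part of the patch):

#include <cstdint>
#include <cstdio>

int main() {
  int32_t A = 2000000000, B = 2000000000, Sum;
  // The builtin compiles to an ADD whose overflow flag feeds the branch
  // directly; no separate CMP instruction is needed on x86.
  if (__builtin_add_overflow(A, B, &Sum))
    std::puts("overflow");
  else
    std::printf("%d\n", Sum);
  return 0;
}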
bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
unsigned AddrSpace) const override {
// If we can replace more than 2 scalar stores, there will be a reduction
@@ -1070,7 +1127,9 @@ namespace llvm {
return NumElem > 2;
}
- bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override;
+ bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const override;
/// Intel processors have a unified instruction and data cache
const char * getClearCacheBuiltinName() const override {
@@ -1105,7 +1164,7 @@ namespace llvm {
bool useStackGuardXorFP() const override;
void insertSSPDeclarations(Module &M) const override;
Value *getSDagStackGuard(const Module &M) const override;
- Value *getSSPStackGuardCheck(const Module &M) const override;
+ Function *getSSPStackGuardCheck(const Module &M) const override;
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL) const override;
@@ -1221,9 +1280,7 @@ namespace llvm {
unsigned getAddressSpace(void) const;
- std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool isSigned,
- bool isReplace) const;
+ SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -1234,12 +1291,15 @@ namespace llvm {
const unsigned char OpFlags = 0) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
- int64_t Offset, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+ /// Creates target global address or external symbol nodes for calls or
+ /// other uses.
+ SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
+ bool ForCall) const;
+
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
@@ -1568,10 +1628,10 @@ namespace llvm {
void scaleShuffleMask(int Scale, ArrayRef<T> Mask,
SmallVectorImpl<T> &ScaledMask) {
assert(0 < Scale && "Unexpected scaling factor");
- int NumElts = Mask.size();
- ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
+ size_t NumElts = Mask.size();
+ ScaledMask.assign(NumElts * Scale, -1);
- for (int i = 0; i != NumElts; ++i) {
+ for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
// Repeat sentinel values in every mask element.
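For readers skimming the header, here is a standalone sketch of what scaleShuffleMask computes, written with std::vector instead of the ArrayRef/SmallVectorImpl interfaces above. The body of the loop is truncated by this hunk, so the sketch assumes the usual widening behavior: sentinel values (< 0) are repeated, and a non-negative element M expands to the Scale consecutive entries Scale*M + s:

#include <cassert>
#include <cstdio>
#include <vector>

// Illustrative stand-in for scaleShuffleMask (not the LLVM implementation).
static void scaleMask(int Scale, const std::vector<int> &Mask,
                      std::vector<int> &ScaledMask) {
  assert(0 < Scale && "Unexpected scaling factor");
  ScaledMask.assign(Mask.size() * Scale, -1);
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    for (int s = 0; s != Scale; ++s)
      ScaledMask[Scale * i + s] = M < 0 ? M : Scale * M + s;
  }
}

int main() {
  std::vector<int> Scaled;
  scaleMask(2, {1, -1, 0, 2}, Scaled); // -> 2 3 -1 -1 0 1 4 5
  for (int M : Scaled)
    std::printf("%d ", M);
  std::printf("\n");
}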
diff --git a/lib/Target/X86/X86IndirectBranchTracking.cpp b/lib/Target/X86/X86IndirectBranchTracking.cpp
index 7c00c9260d15..04e8b2231fec 100644
--- a/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -1,9 +1,8 @@
//===---- X86IndirectBranchTracking.cpp - Enables CET IBT mechanism -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -58,7 +57,7 @@ private:
/// The function will not add it if one already exists.
/// It will add an ENDBR32 or ENDBR64 opcode, depending on the target.
/// \returns true if the ENDBR was added and false otherwise.
- bool addENDBR(MachineBasicBlock &MBB) const;
+ bool addENDBR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
};
} // end anonymous namespace
@@ -69,20 +68,31 @@ FunctionPass *llvm::createX86IndirectBranchTrackingPass() {
return new X86IndirectBranchTrackingPass();
}
-bool X86IndirectBranchTrackingPass::addENDBR(MachineBasicBlock &MBB) const {
+bool X86IndirectBranchTrackingPass::addENDBR(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
assert(TII && "Target instruction info was not initialized");
assert((X86::ENDBR64 == EndbrOpcode || X86::ENDBR32 == EndbrOpcode) &&
"Unexpected Endbr opcode");
- auto MI = MBB.begin();
- // If the MBB is empty or the first instruction is not ENDBR,
- // add the ENDBR instruction to the beginning of the MBB.
- if (MI == MBB.end() || EndbrOpcode != MI->getOpcode()) {
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(EndbrOpcode));
- NumEndBranchAdded++;
+ // If I is at the end of the MBB, or the instruction at I is not an ENDBR,
+ // insert an ENDBR instruction at the location of I.
+ if (I == MBB.end() || I->getOpcode() != EndbrOpcode) {
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(EndbrOpcode));
+ ++NumEndBranchAdded;
return true;
}
+ return false;
+}
+bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
+ if (!MOp.isGlobal())
+ return false;
+ auto *CalleeFn = dyn_cast<Function>(MOp.getGlobal());
+ if (!CalleeFn)
+ return false;
+ AttributeList Attrs = CalleeFn->getAttributes();
+ if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice))
+ return true;
return false;
}
@@ -108,14 +118,21 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
!MF.getFunction().hasLocalLinkage()) &&
!MF.getFunction().doesNoCfCheck()) {
auto MBB = MF.begin();
- Changed |= addENDBR(*MBB);
+ Changed |= addENDBR(*MBB, MBB->begin());
}
- for (auto &MBB : MF)
+ for (auto &MBB : MF) {
// Find all basic blocks whose address was taken (for example,
// as the target of an indirect jump) and add an ENDBR instruction.
if (MBB.hasAddressTaken())
- Changed |= addENDBR(MBB);
-
+ Changed |= addENDBR(MBB, MBB.begin());
+
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ if (!I->isCall())
+ continue;
+ if (IsCallReturnTwice(I->getOperand(0)))
+ Changed |= addENDBR(MBB, std::next(I));
+ }
+ }
return Changed;
}
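Why the new IsCallReturnTwice check: a returns_twice callee such as setjmp can be returned to a second time by longjmp, which transfers control indirectly to the instruction right after the original call, so under CET IBT that landing spot needs an ENDBR. A small C++ illustration of the call pattern the pass now instruments (the example itself is not from the patch):

#include <csetjmp>
#include <cstdio>

static std::jmp_buf Env;

static void fail() { std::longjmp(Env, 1); } // indirect transfer back

int main() {
  // With IBT enabled, an ENDBR is emitted right after this call, because
  // longjmp lands there via an indirect branch rather than a normal return.
  if (setjmp(Env) == 0)
    fail();
  else
    std::puts("returned twice");
  return 0;
}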
diff --git a/lib/Target/X86/X86InsertPrefetch.cpp b/lib/Target/X86/X86InsertPrefetch.cpp
index 30b46a09ef0f..02ae73706a34 100644
--- a/lib/Target/X86/X86InsertPrefetch.cpp
+++ b/lib/Target/X86/X86InsertPrefetch.cpp
@@ -1,9 +1,8 @@
//===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -34,7 +33,8 @@ using namespace sampleprof;
static cl::opt<std::string>
PrefetchHintsFile("prefetch-hints-file",
- cl::desc("Path to the prefetch hints profile."),
+ cl::desc("Path to the prefetch hints profile. See also "
+ "-x86-discriminate-memops"),
cl::Hidden);
namespace {
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index 49e9e924887a..cd1b06365971 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -1,9 +1,8 @@
//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -74,7 +73,9 @@ defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", WriteFAdd, 1>;
defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", WriteCvtI2PS>;
defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>;
-let SchedRW = [WriteEMMS] in
+let SchedRW = [WriteEMMS],
+ Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in
def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
[(int_x86_mmx_femms)]>, TB;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 7423cb85acd2..54eddeacaa17 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1,9 +1,8 @@
//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,6 +26,10 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
// Corresponding mask register class.
RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts);
+ // Corresponding mask register pair class.
+ RegisterOperand KRPC = !if (!gt(NumElts, 16), ?,
+ !cast<RegisterOperand>("VK" # NumElts # "Pair"));
+
// Corresponding write-mask register class.
RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
@@ -95,10 +98,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
- // A vector type of the same width with element type i32. This is used to
- // create the canonical constant zero node ImmAllZerosV.
- ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
- dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV)));
+ dag ImmAllZerosV = (VT immAllZerosV);
string ZSuffix = !if (!eq (Size, 128), "Z128",
!if (!eq (Size, 256), "Z256", "Z"));
@@ -277,10 +277,9 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS,
- bit IsCommutable = 0> :
+ dag RHS> :
AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
- RHS, IsCommutable, 0, IsCommutable, X86selects>;
+ RHS, 0, 0, 0, X86selects>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -365,7 +364,7 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
list<dag> Pattern,
list<dag> MaskingPattern,
bit IsCommutable = 0> {
- let isCommutable = IsCommutable in
+ let isCommutable = IsCommutable in {
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
@@ -375,6 +374,7 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
MaskingPattern>, EVEX_K;
+ }
}
multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
@@ -392,38 +392,11 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, bit IsCommutable = 0> :
+ dag RHS, dag RHS_su, bit IsCommutable = 0> :
AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (and _.KRCWM:$mask, RHS), IsCommutable>;
-
-multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
- dag Outs, dag Ins, string OpcodeStr,
- string AttSrcAsm, string IntelSrcAsm> :
- AVX512_maskable_custom_cmp<O, F, Outs,
- Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
- AttSrcAsm, IntelSrcAsm, [], []>;
-
-// This multiclass generates the unconditional/non-masking, the masking and
-// the zero-masking variant of the vector instruction. In the masking case, the
-// perserved vector elements come from a new dummy input operand tied to $dst.
-multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
- dag Outs, dag Ins, string OpcodeStr,
- string AttSrcAsm, string IntelSrcAsm,
- dag RHS, dag MaskedRHS,
- bit IsCommutable = 0, SDNode Select = vselect> :
- AVX512_maskable_custom<O, F, Outs, Ins,
- !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
- !con((ins _.KRCWM:$mask), Ins),
- OpcodeStr, AttSrcAsm, IntelSrcAsm,
- [(set _.RC:$dst, RHS)],
- [(set _.RC:$dst,
- (Select _.KRCWM:$mask, MaskedRHS, _.RC:$src0))],
- [(set _.RC:$dst,
- (Select _.KRCWM:$mask, MaskedRHS,
- _.ImmAllZerosV))],
- "$src0 = $dst", IsCommutable>;
+ (and _.KRCWM:$mask, RHS_su), IsCommutable>;
// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
@@ -451,8 +424,8 @@ def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst),
def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
(ins VK8WM:$mask), "",
[(set VR512:$dst, (vselect (v8i1 VK8WM:$mask),
- (bc_v8i64 (v16i32 immAllOnesV)),
- (bc_v8i64 (v16i32 immAllZerosV))))]>;
+ (v8i64 immAllOnesV),
+ (v8i64 immAllZerosV)))]>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -753,6 +726,7 @@ defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
// vinsertps - insert f32 to XMM
let ExeDomain = SSEPackedSingle in {
+let isCommutable = 1 in
def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -1378,15 +1352,15 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX512] in {
// 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))),
+ def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZm addr:$src)>;
}
let Predicates = [HasVLX] in {
// 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZ128m addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZ256m addr:$src)>;
}
let Predicates = [HasVLX, HasBWI] in {
@@ -1397,12 +1371,30 @@ let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
(VPBROADCASTWZ256m addr:$src)>;
def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWZ128m addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZ128m addr:$src)>;
def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWZ256m addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZ256m addr:$src)>;
}
+let Predicates = [HasBWI] in {
+ // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+ // This means we'll encounter truncated i32 loads; match that here.
+ def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWZm addr:$src)>;
+ def : Pat<(v32i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWZm addr:$src)>;
+ def : Pat<(v32i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWZm addr:$src)>;
+}
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST SUBVECTORS
@@ -1464,7 +1456,7 @@ def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
// Patterns for selects of bitcasted operations.
def : Pat<(vselect VK16WM:$mask,
(bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- (bc_v16f32 (v16i32 immAllZerosV))),
+ (v16f32 immAllZerosV)),
(VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect VK16WM:$mask,
(bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
@@ -1481,7 +1473,7 @@ def : Pat<(vselect VK16WM:$mask,
def : Pat<(vselect VK8WM:$mask,
(bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
- (bc_v8f64 (v16i32 immAllZerosV))),
+ (v8f64 immAllZerosV)),
(VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
@@ -1489,7 +1481,7 @@ def : Pat<(vselect VK8WM:$mask,
(VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
- (bc_v8i64 (v16i32 immAllZerosV))),
+ (v8i64 immAllZerosV)),
(VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
@@ -1517,7 +1509,7 @@ def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
// Patterns for selects of bitcasted operations.
def : Pat<(vselect VK8WM:$mask,
(bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- (bc_v8f32 (v8i32 immAllZerosV))),
+ (v8f32 immAllZerosV)),
(VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
@@ -1566,7 +1558,7 @@ defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2"
// Patterns for selects of bitcasted operations.
def : Pat<(vselect VK4WM:$mask,
(bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- (bc_v4f64 (v8i32 immAllZerosV))),
+ (v4f64 immAllZerosV)),
(VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>;
def : Pat<(vselect VK4WM:$mask,
(bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
@@ -1574,7 +1566,7 @@ def : Pat<(vselect VK4WM:$mask,
(VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
def : Pat<(vselect VK4WM:$mask,
(bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- (bc_v4i64 (v8i32 immAllZerosV))),
+ (v4i64 immAllZerosV)),
(VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
def : Pat<(vselect VK4WM:$mask,
(bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
@@ -1599,7 +1591,7 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
// Patterns for selects of bitcasted operations.
def : Pat<(vselect VK16WM:$mask,
(bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
- (bc_v16f32 (v16i32 immAllZerosV))),
+ (v16f32 immAllZerosV)),
(VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect VK16WM:$mask,
(bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
@@ -1616,7 +1608,7 @@ def : Pat<(vselect VK16WM:$mask,
def : Pat<(vselect VK8WM:$mask,
(bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- (bc_v8f64 (v16i32 immAllZerosV))),
+ (v8f64 immAllZerosV)),
(VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
@@ -1624,7 +1616,7 @@ def : Pat<(vselect VK8WM:$mask,
(VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- (bc_v8i64 (v16i32 immAllZerosV))),
+ (v8i64 immAllZerosV)),
(VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
@@ -2031,96 +2023,86 @@ defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend,
// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
-multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
+multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
+ PatFrag OpNode_su, PatFrag OpNodeSAE_su,
X86FoldableSchedWrite sched> {
defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc)>, EVEX_4V, Sched<[sched]>;
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.IntScalarMemOp:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "$src2, $src1", "$src1, $src2",
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ imm:$cc),
+ (OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
+ imm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (OpNodeRnd (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc,
- (i32 FROUND_NO_EXC))>,
- EVEX_4V, EVEX_B, Sched<[sched]>;
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
- (outs VK1:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V,
- Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
-
- defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">,
- EVEX_4V, EVEX_B, Sched<[sched]>, NotMemoryFoldable;
- }// let isAsmParserOnly = 1, hasSideEffects = 0
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc",
+ (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc),
+ (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc)>,
+ EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
let isCodeGenOnly = 1 in {
let isCommutable = 1 in
def rr : AVX512Ii8<0xC2, MRMSrcReg,
- (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc),
- !strconcat("vcmp${cc}", _.Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, u8imm:$cc),
+ !strconcat("vcmp", _.Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
imm:$cc))]>,
- EVEX_4V, Sched<[sched]>;
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
- (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
- !strconcat("vcmp${cc}", _.Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+ !strconcat("vcmp", _.Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2),
imm:$cc))]>,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
+def X86cmpms_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpms node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+def X86cmpmsSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpmsSAE node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+
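The new *_su ("single use") fragments differ from the plain ones only in the N->hasOneUse() predicate: the masked patterns fold the compare into the masked instruction only when the compare has no other users, so its result is never computed twice. A minimal sketch of the same guard expressed at the C++ DAG level (illustrative only; the patch states it as PatFrag predicates):

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Mirror of the PatFrag predicate above: only fold a compare under a mask
// when the masked use is its sole user.
static bool canFoldMaskedCompare(SDValue Cmp) {
  return Cmp.getNode()->hasOneUse();
}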
let Predicates = [HasAVX512] in {
let ExeDomain = SSEPackedSingle in
- defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd,
+ defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsSAE,
+ X86cmpms_su, X86cmpmsSAE_su,
SchedWriteFCmp.Scl>, AVX512XSIi8Base;
let ExeDomain = SSEPackedDouble in
- defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd,
+ defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsSAE,
+ X86cmpms_su, X86cmpmsSAE_su,
SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W;
}
multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- X86FoldableSchedWrite sched, X86VectorVTInfo _,
- bit IsCommutable> {
+ PatFrag OpNode_su, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, bit IsCommutable> {
let isCommutable = IsCommutable in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
@@ -2139,22 +2121,23 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>,
+ (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>,
EVEX_4V, EVEX_K, Sched<[sched]>;
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
+ (OpNode_su (_.VT _.RC:$src1),
(_.VT (_.LdFrag addr:$src2)))))]>,
EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ PatFrag OpNode_su,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
bit IsCommutable> :
- avx512_icmp_packed<opc, OpcodeStr, OpNode, sched, _, IsCommutable> {
+ avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched, _, IsCommutable> {
def rmb : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
@@ -2169,7 +2152,7 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
+ (OpNode_su (_.VT _.RC:$src1),
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))]>,
EVEX_4V, EVEX_K, EVEX_B,
@@ -2177,33 +2160,34 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
}
multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- X86SchedWriteWidths sched,
+ PatFrag OpNode_su, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd,
bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.ZMM,
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM,
VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.YMM,
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM,
VTInfo.info256, IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.XMM,
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM,
VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
- PatFrag OpNode, X86SchedWriteWidths sched,
+ PatFrag OpNode, PatFrag OpNode_su,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo,
Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.ZMM,
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM,
VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.YMM,
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM,
VTInfo.info256, IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.XMM,
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM,
VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
@@ -2216,59 +2200,69 @@ def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2),
def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
(setcc node:$src1, node:$src2, SETGT)>;
+def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86pcmpeqm_c node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86pcmpgtm node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
// increase the pattern complexity the way an immediate would.
let AddedComplexity = 2 in {
// FIXME: Is there a better scheduler class for VPCMP?
-defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c,
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su,
SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c,
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su,
SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c,
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su,
SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c,
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su,
SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
+defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su,
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
+defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su,
SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
+defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su,
SchedWriteVecALU, avx512vl_i32_info, HasAVX512>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
+defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su,
SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag CommFrag, X86FoldableSchedWrite sched,
+ PatFrag Frag_su, PatFrag CommFrag, PatFrag CommFrag_su,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Name> {
let isCommutable = 1 in
def rri : AVX512AIi8<opc, MRMSrcReg,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
cond)))]>,
EVEX_4V, Sched<[sched]>;
def rmi : AVX512AIi8<opc, MRMSrcMem,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (_.KVT
(Frag:$cc
(_.VT _.RC:$src1),
@@ -2278,67 +2272,36 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
- AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{$src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2}"),
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (_.KVT (Frag:$cc (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- cond))))]>,
+ (_.KVT (Frag_su:$cc (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ cond))))]>,
EVEX_4V, EVEX_K, Sched<[sched]>;
def rmik : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
- AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{$src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2}"),
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(_.KVT
- (Frag:$cc
+ (Frag_su:$cc
(_.VT _.RC:$src1),
(_.VT (_.LdFrag addr:$src2)),
cond))))]>,
EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def rri_alt : AVX512AIi8<opc, MRMSrcReg,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
- "$dst, $src1, $src2, $cc}"), []>,
- EVEX_4V, Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
- !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
- "$dst, $src1, $src2, $cc}"), []>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
- def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
- (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
- u8imm:$cc),
- !strconcat("vpcmp", Suffix,
- "\t{$cc, $src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2, $cc}"), []>,
- EVEX_4V, EVEX_K, Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
- (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
- u8imm:$cc),
- !strconcat("vpcmp", Suffix,
- "\t{$cc, $src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2, $cc}"), []>,
- EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- }
-
def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
(_.VT _.RC:$src1), cond)),
(!cast<Instruction>(Name#_.ZSuffix#"rmi")
_.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
def : Pat<(and _.KRCWM:$mask,
- (_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
+ (_.KVT (CommFrag_su:$cc (_.LdFrag addr:$src2),
(_.VT _.RC:$src1), cond))),
(!cast<Instruction>(Name#_.ZSuffix#"rmik")
_.KRCWM:$mask, _.RC:$src1, addr:$src2,
@@ -2346,15 +2309,17 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
}
multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag CommFrag, X86FoldableSchedWrite sched,
+ PatFrag Frag_su, PatFrag CommFrag,
+ PatFrag CommFrag_su, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Name> :
- avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched, _, Name> {
+ avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched, _, Name> {
def rmib : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
- AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
- "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
+ "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
[(set _.KRC:$dst, (_.KVT (Frag:$cc
(_.VT _.RC:$src1),
(X86VBroadcast
@@ -2363,45 +2328,25 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
- _.ScalarMemOp:$src2, AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+ _.ScalarMemOp:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (_.KVT (Frag:$cc
+ (_.KVT (Frag_su:$cc
(_.VT _.RC:$src1),
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)),
cond))))]>,
EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
- def rmib_alt : AVX512AIi8<opc, MRMSrcMem,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
- u8imm:$cc),
- !strconcat("vpcmp", Suffix,
- "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
- "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
- (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
- _.ScalarMemOp:$src2, u8imm:$cc),
- !strconcat("vpcmp", Suffix,
- "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>,
- EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- }
-
def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
(_.VT _.RC:$src1), cond)),
(!cast<Instruction>(Name#_.ZSuffix#"rmib")
_.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
def : Pat<(and _.KRCWM:$mask,
- (_.KVT (CommFrag:$cc (X86VBroadcast
+ (_.KVT (CommFrag_su:$cc (X86VBroadcast
(_.ScalarLdFrag addr:$src2)),
(_.VT _.RC:$src1), cond))),
(!cast<Instruction>(Name#_.ZSuffix#"rmibk")
@@ -2410,32 +2355,34 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
}
multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag CommFrag, X86SchedWriteWidths sched,
+ PatFrag Frag_su, PatFrag CommFrag,
+ PatFrag CommFrag_su, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.ZMM,
- VTInfo.info512, NAME>, EVEX_V512;
+ defm Z : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.ZMM, VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.YMM,
- VTInfo.info256, NAME>, EVEX_V256;
- defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.XMM,
- VTInfo.info128, NAME>, EVEX_V128;
+ defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.YMM, VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.XMM, VTInfo.info128, NAME>, EVEX_V128;
}
}
multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag CommFrag, X86SchedWriteWidths sched,
+ PatFrag Frag_su, PatFrag CommFrag,
+ PatFrag CommFrag_su, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.ZMM,
- VTInfo.info512, NAME>, EVEX_V512;
+ defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.ZMM, VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.YMM,
- VTInfo.info256, NAME>, EVEX_V256;
- defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.XMM,
- VTInfo.info128, NAME>, EVEX_V128;
+ defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.YMM, VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.XMM, VTInfo.info128, NAME>, EVEX_V128;
}
}
@@ -2459,6 +2406,12 @@ def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc),
return !ISD::isUnsignedIntSetCC(CC);
}], X86pcmpm_imm>;
+def X86pcmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
// Same as above, but commutes immediate. Use for load folding.
def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
(setcc node:$src1, node:$src2, node:$cc), [{
@@ -2466,12 +2419,24 @@ def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
return !ISD::isUnsignedIntSetCC(CC);
}], X86pcmpm_imm_commute>;
+def X86pcmpm_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc),
(setcc node:$src1, node:$src2, node:$cc), [{
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
return ISD::isUnsignedIntSetCC(CC);
}], X86pcmpm_imm>;
+def X86pcmpum_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
// Same as above, but commutes immediate. Use for load folding.
def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
(setcc node:$src1, node:$src2, node:$cc), [{
@@ -2479,93 +2444,91 @@ def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
return ISD::isUnsignedIntSetCC(CC);
}], X86pcmpm_imm_commute>;
+def X86pcmpum_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
// FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_commute,
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>;
-defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>;
-defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_commute,
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_commute,
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i32_info,
HasAVX512>, EVEX_CD8<32, CD8VF>;
-defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i32_info,
HasAVX512>, EVEX_CD8<32, CD8VF>;
-defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_commute,
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+def X86cmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpm node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpmSAE node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+
multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
string Name> {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (X86cmpm (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc), 1>,
- Sched<[sched]>;
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
+ (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ 1>, Sched<[sched]>;
defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (X86cmpm (_.VT _.RC:$src1),
- (_.VT (_.LdFrag addr:$src2)),
- imm:$cc)>,
+ (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
+ (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
+ imm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
+ imm:$cc)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $cc",
(X86cmpm (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- imm:$cc)>,
+ imm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ imm:$cc)>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">,
- Sched<[sched]>, NotMemoryFoldable;
-
- let mayLoad = 1 in {
- defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">,
- Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
-
- defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $cc">,
- EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- }
- }
// Patterns for selecting with loads in other operand.
def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1),
@@ -2573,9 +2536,9 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
(!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
imm:$cc)>;
- def : Pat<(and _.KRCWM:$mask, (X86cmpm (_.LdFrag addr:$src2),
- (_.VT _.RC:$src1),
- CommutableCMPCC:$cc)),
+ def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2),
+ (_.VT _.RC:$src1),
+ CommutableCMPCC:$cc)),
(!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
imm:$cc)>;
@@ -2585,10 +2548,10 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
(!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
imm:$cc)>;
- def : Pat<(and _.KRCWM:$mask, (X86cmpm (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1),
- CommutableCMPCC:$cc)),
+ def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1),
+ CommutableCMPCC:$cc)),
(!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
imm:$cc)>;
@@ -2597,24 +2560,14 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
// comparison code form (VCMP[EQ/LT/LE/...]
defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (X86cmpmRnd (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc,
- (i32 FROUND_NO_EXC))>,
+ (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, {sae}, $src2, $src1",
+ "$src1, $src2, {sae}, $cc",
+ (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc)>,
EVEX_B, Sched<[sched]>;
-
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, {sae}, $src2, $src1",
- "$src1, $src2, {sae}, $cc">,
- EVEX_B, Sched<[sched]>, NotMemoryFoldable;
- }
}
multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
@@ -2647,16 +2600,27 @@ let Predicates = [HasAVX512] in {
// ----------------------------------------------------------------
// FPClass
+
+def X86Vfpclasss_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86Vfpclasss node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
+def X86Vfpclass_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86Vfpclass node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
//handle fpclass instruction mask = op(reg_scalar,imm)
// op(mem_scalar,imm)
-multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
Predicate prd> {
let Predicates = [prd], ExeDomain = _.ExeDomain in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+ [(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1),
(i32 imm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
@@ -2664,7 +2628,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
+ (X86Vfpclasss_su (_.VT _.RC:$src1),
(i32 imm:$src2))))]>,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
@@ -2672,15 +2636,15 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,
- (OpNode _.ScalarIntMemCPat:$src1,
- (i32 imm:$src2)))]>,
+ (X86Vfpclasss _.ScalarIntMemCPat:$src1,
+ (i32 imm:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
- (OpNode _.ScalarIntMemCPat:$src1,
+ (X86Vfpclasss_su _.ScalarIntMemCPat:$src1,
(i32 imm:$src2))))]>,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -2689,14 +2653,14 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm)
//                                   fpclass(reg_vec, mem_vec, imm)
//                                   fpclass(reg_vec, broadcast(eltVt), imm)
-multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
- string mem, string broadcast>{
+ string mem>{
let ExeDomain = _.ExeDomain in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+ [(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1),
(i32 imm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
@@ -2704,85 +2668,103 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
+ (X86Vfpclass_su (_.VT _.RC:$src1),
(i32 imm:$src2))))]>,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##mem#
+ OpcodeStr##_.Suffix#"{"#mem#"}"#
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.KRC:$dst,(OpNode
+ [(set _.KRC:$dst,(X86Vfpclass
(_.VT (_.LdFrag addr:$src1)),
(i32 imm:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##mem#
+ OpcodeStr##_.Suffix#"{"#mem#"}"#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode
+ [(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su
(_.VT (_.LdFrag addr:$src1)),
(i32 imm:$src2))))]>,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ OpcodeStr##_.Suffix##"\t{$src2, ${src1}"##
_.BroadcastStr##", $dst|$dst, ${src1}"
##_.BroadcastStr##", $src2}",
- [(set _.KRC:$dst,(OpNode
+ [(set _.KRC:$dst,(X86Vfpclass
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
(i32 imm:$src2)))]>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ OpcodeStr##_.Suffix##"\t{$src2, ${src1}"##
_.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
_.BroadcastStr##", $src2}",
- [(set _.KRC:$dst,(and _.KRCWM:$mask, (OpNode
+ [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
(i32 imm:$src2))))]>,
EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+
+ // Allow registers or broadcast with the x, y, z suffix we use to disambiguate
+ // the memory form (see the example spellings after this multiclass).
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>(NAME#"rr")
+ _.KRC:$dst, _.RC:$src1, i32u8imm:$src2), 0, "att">;
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ (!cast<Instruction>(NAME#"rrk")
+ _.KRC:$dst, _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), 0, "att">;
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, ${src1}"#_.BroadcastStr#", $dst|$dst, ${src1}"#
+ _.BroadcastStr#", $src2}",
+ (!cast<Instruction>(NAME#"rmb")
+ _.KRC:$dst, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">;
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, ${src1}"#_.BroadcastStr#", $dst {${mask}}|"
+ "$dst {${mask}}, ${src1}"#_.BroadcastStr#", $src2}",
+ (!cast<Instruction>(NAME#"rmbk")
+ _.KRC:$dst, _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">;
}
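For illustration only (not part of the patch): with the aliases above, the AT&T-syntax
assembler should also accept the x/y/z-suffixed mnemonic for the register and broadcast
forms, not just the memory form. For the 512-bit single-precision variant the accepted
spellings would look roughly like this (the exact mnemonics come from the OpcodeStr,
_.Suffix and mem expansions, so treat this as a sketch):

    vfpclassps  $4, %zmm1, %k2            # canonical register form
    vfpclasspsz $4, %zmm1, %k2            # same instruction, matched via the new alias
    vfpclasspsz $4, (%rax){1to16}, %k2    # broadcast form, matched via the new alias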
multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _,
- bits<8> opc, SDNode OpNode,
- X86SchedWriteWidths sched, Predicate prd,
- string broadcast>{
+ bits<8> opc, X86SchedWriteWidths sched,
+ Predicate prd>{
let Predicates = [prd] in {
- defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.ZMM,
- _.info512, "{z}", broadcast>, EVEX_V512;
+ defm Z : avx512_vector_fpclass<opc, OpcodeStr, sched.ZMM,
+ _.info512, "z">, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.XMM,
- _.info128, "{x}", broadcast>, EVEX_V128;
- defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.YMM,
- _.info256, "{y}", broadcast>, EVEX_V256;
+ defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, sched.XMM,
+ _.info128, "x">, EVEX_V128;
+ defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, sched.YMM,
+ _.info256, "y">, EVEX_V256;
}
}
multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
- bits<8> opcScalar, SDNode VecOpNode,
- SDNode ScalarOpNode, X86SchedWriteWidths sched,
+ bits<8> opcScalar, X86SchedWriteWidths sched,
Predicate prd> {
defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
- VecOpNode, sched, prd, "{l}">,
+ sched, prd>,
EVEX_CD8<32, CD8VF>;
defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
- VecOpNode, sched, prd, "{q}">,
+ sched, prd>,
EVEX_CD8<64, CD8VF> , VEX_W;
- defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
- sched.Scl, f32x_info, prd>,
+ defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
+ sched.Scl, f32x_info, prd>, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
- defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
- sched.Scl, f64x_info, prd>,
+ defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
+ sched.Scl, f64x_info, prd>, VEX_LIG,
EVEX_CD8<64, CD8VT1>, VEX_W;
}
-defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
- X86Vfpclasss, SchedWriteFCmp, HasDQI>,
- AVX512AIi8Base, EVEX;
+defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp,
+ HasDQI>, AVX512AIi8Base, EVEX;
//-----------------------------------------------------------------
// Mask register copy, including
@@ -3039,26 +3021,24 @@ defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>;
defm : avx512_binop_pat<xor, xor, KXORWrr>;
// Mask unpacking
-multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
- RegisterClass KRCSrc, X86FoldableSchedWrite sched,
+multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst,
+ X86KVectorVTInfo Src, X86FoldableSchedWrite sched,
Predicate prd> {
let Predicates = [prd] in {
let hasSideEffects = 0 in
- def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst),
- (ins KRC:$src1, KRC:$src2),
+ def rr : I<0x4b, MRMSrcReg, (outs Dst.KRC:$dst),
+ (ins Src.KRC:$src1, Src.KRC:$src2),
"kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
VEX_4V, VEX_L, Sched<[sched]>;
- def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)),
- (!cast<Instruction>(NAME##rr)
- (COPY_TO_REGCLASS KRCSrc:$src2, KRC),
- (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>;
+ def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)),
+ (!cast<Instruction>(NAME##rr) Src.KRC:$src2, Src.KRC:$src1)>;
}
}
-defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, WriteShuffle, HasAVX512>, PD;
-defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, WriteShuffle, HasBWI>, PS;
-defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, WriteShuffle, HasBWI>, PS, VEX_W;
+defm KUNPCKBW : avx512_mask_unpck<"bw", v16i1_info, v8i1_info, WriteShuffle, HasAVX512>, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, PS;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, PS, VEX_W;
// Mask bit testing
multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
@@ -3118,7 +3098,8 @@ defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShu
defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>;
// Patterns for comparing 128/256-bit integer vectors using 512-bit instructions.
-multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr,
+multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
+ string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1),
@@ -3130,8 +3111,8 @@ multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr,
Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
- (Frag (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2)))),
+ (Frag_su (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2)))),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrrk")
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
@@ -3141,7 +3122,7 @@ multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr,
}
// Patterns for comparing 128/256-bit integer vectors using 512-bit instructions.
-multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag,
+multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
@@ -3154,9 +3135,9 @@ def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
(Frag.OperandTransform $cc)), Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
- (Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2),
- cond)))),
+ (Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2),
+ cond)))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3165,7 +3146,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
}
// Same as above, but for fp types which don't use PatFrags.
-multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
+multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, PatFrag OpNode_su,
+ string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1),
@@ -3177,8 +3159,8 @@ def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1),
imm:$cc), Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
- (OpNode (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2), imm:$cc))),
+ (OpNode_su (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), imm:$cc))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3190,65 +3172,65 @@ let Predicates = [HasAVX512, NoVLX] in {
// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
// increase the pattern complexity the way an immediate would.
let AddedComplexity = 2 in {
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v2i64x_info, v8i64_info>;
}
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v8f32x_info, v16f32_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v4f32x_info, v16f32_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v4f64x_info, v8f64_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v2f64x_info, v8f64_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v8f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v4f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v4f64x_info, v8f64_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v2f64x_info, v8f64_info>;
}
let Predicates = [HasBWI, NoVLX] in {
// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
// increase the pattern complexity the way an immediate would.
let AddedComplexity = 2 in {
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v8i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v8i16x_info, v32i16_info>;
}
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v8i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUW", v8i16x_info, v32i16_info>;
}
// Mask setting all 0s or 1s
@@ -3394,15 +3376,15 @@ multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
string EVEX2VEXOvrd, bit NoRMPattern = 0> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512,
- _.info512.AlignedLdFrag, masked_load_aligned512,
+ _.info512.AlignedLdFrag, masked_load_aligned,
Sched.ZMM, "", NoRMPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256,
- _.info256.AlignedLdFrag, masked_load_aligned256,
+ _.info256.AlignedLdFrag, masked_load_aligned,
Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128,
- _.info128.AlignedLdFrag, masked_load_aligned128,
+ _.info128.AlignedLdFrag, masked_load_aligned,
Sched.XMM, EVEX2VEXOvrd, NoRMPattern>, EVEX_V128;
}
}
@@ -3414,15 +3396,15 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
SDPatternOperator SelectOprr = vselect> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.LdFrag,
- masked_load_unaligned, Sched.ZMM, "",
+ masked_load, Sched.ZMM, "",
NoRMPattern, SelectOprr>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.LdFrag,
- masked_load_unaligned, Sched.YMM, EVEX2VEXOvrd#"Y",
+ masked_load, Sched.YMM, EVEX2VEXOvrd#"Y",
NoRMPattern, SelectOprr>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.LdFrag,
- masked_load_unaligned, Sched.XMM, EVEX2VEXOvrd,
+ masked_load, Sched.XMM, EVEX2VEXOvrd,
NoRMPattern, SelectOprr>, EVEX_V128;
}
}
@@ -3488,14 +3470,14 @@ multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
string EVEX2VEXOvrd, bit NoMRPattern = 0> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, store,
- masked_store_unaligned, Sched.ZMM, "",
+ masked_store, Sched.ZMM, "",
NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, store,
- masked_store_unaligned, Sched.YMM,
+ masked_store, Sched.YMM,
EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, store,
- masked_store_unaligned, Sched.XMM, EVEX2VEXOvrd,
+ masked_store, Sched.XMM, EVEX2VEXOvrd,
NoMRPattern>, EVEX_V128;
}
}
@@ -3506,15 +3488,15 @@ multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
string EVEX2VEXOvrd, bit NoMRPattern = 0> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, alignedstore,
- masked_store_aligned512, Sched.ZMM, "",
+ masked_store_aligned, Sched.ZMM, "",
NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, alignedstore,
- masked_store_aligned256, Sched.YMM,
+ masked_store_aligned, Sched.YMM,
EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, alignedstore,
- masked_store_aligned128, Sched.XMM, EVEX2VEXOvrd,
+ masked_store_aligned, Sched.XMM, EVEX2VEXOvrd,
NoMRPattern>, EVEX_V128;
}
}
@@ -3609,7 +3591,7 @@ def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
"", []>, Sched<[WriteFStoreY]>;
}
-def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 immAllZerosV),
(v8i64 VR512:$src))),
(VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
VK8), VR512:$src)>;
@@ -3621,7 +3603,7 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
// These patterns exist to prevent the above patterns from introducing a second
// mask inversion when one already exists.
def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)),
- (bc_v8i64 (v16i32 immAllZerosV)),
+ (v8i64 immAllZerosV),
(v8i64 VR512:$src))),
(VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>;
def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
@@ -3761,75 +3743,6 @@ let Predicates = [HasVLX] in {
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
}
-multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From,
- X86VectorVTInfo To, X86VectorVTInfo Cast> {
- def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
- (bitconvert
- (To.VT (extract_subvector
- (From.VT From.RC:$src), (iPTR 0)))),
- To.RC:$src0)),
- (Cast.VT (!cast<Instruction>(InstrStr#"rrk")
- Cast.RC:$src0, Cast.KRCWM:$mask,
- (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
-
- def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
- (bitconvert
- (To.VT (extract_subvector
- (From.VT From.RC:$src), (iPTR 0)))),
- Cast.ImmAllZerosV)),
- (Cast.VT (!cast<Instruction>(InstrStr#"rrkz")
- Cast.KRCWM:$mask,
- (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
-}
-
-
-let Predicates = [HasVLX] in {
-// A masked extract from the first 128-bits of a 256-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z128", v4i64x_info, v2i64x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v8i32x_info, v4i32x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v16i16x_info, v8i16x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v32i8x_info, v16i8x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v4i64x_info, v2i64x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v8i32x_info, v4i32x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v16i16x_info, v8i16x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v32i8x_info, v16i8x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v4f64x_info, v2f64x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v8f32x_info, v4f32x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v4f64x_info, v2f64x_info, v4f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v8f32x_info, v4f32x_info, v4f32x_info>;
-
-// A masked extract from the first 128-bits of a 512-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z128", v8i64_info, v2i64x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v16i32_info, v4i32x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v32i16_info, v8i16x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v64i8_info, v16i8x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v8i64_info, v2i64x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v16i32_info, v4i32x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v32i16_info, v8i16x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v64i8_info, v16i8x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v8f64_info, v2f64x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v16f32_info, v4f32x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v8f64_info, v2f64x_info, v4f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v16f32_info, v4f32x_info, v4f32x_info>;
-
-// A masked extract from the first 256-bits of a 512-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z256", v8i64_info, v4i64x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z256", v16i32_info, v8i32x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z256", v32i16_info, v16i16x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z256", v64i8_info, v32i8x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v8i64_info, v4i64x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v16i32_info, v8i32x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v32i16_info, v16i16x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v64i8_info, v32i8x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ256", v8f64_info, v4f64x_info, v4f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ256", v16f32_info, v8f32x_info, v4f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ256", v8f64_info, v4f64x_info, v8f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ256", v16f32_info, v8f32x_info, v8f32x_info>;
-}
-
// Move Int Doubleword to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
@@ -3858,19 +3771,10 @@ def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src)
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64X:$dst, (bitconvert GR64:$src))]>,
EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
-def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>;
def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64X:$src))]>,
EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
-def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64X:$src)), addr:$dst)]>,
- EVEX, VEX_W, Sched<[WriteVecStore]>,
- EVEX_CD8<64, CD8VT1>;
}
} // ExeDomain = SSEPackedInt
@@ -3881,11 +3785,6 @@ def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src)
"vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert GR32:$src))]>,
EVEX, Sched<[WriteVecMoveFromGpr]>;
-
-def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))]>,
- EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move doubleword from xmm register to r/m32
@@ -3938,6 +3837,11 @@ def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
(VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>;
+let Predicates = [HasAVX512] in {
+ def : Pat<(X86vextractstore64 (v2i64 VR128X:$src), addr:$dst),
+ (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>;
+}
+
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
@@ -3946,11 +3850,6 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
"vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32X:$src))]>,
EVEX, Sched<[WriteVecMoveToGpr]>;
-def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
- (ins i32mem:$dst, FR32X:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32X:$src)), addr:$dst)]>,
- EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move Quadword Int to Packed Quadword Int
@@ -3974,7 +3873,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
-multiclass avx512_move_scalar<string asm, SDNode OpNode,
+multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
X86VectorVTInfo _> {
let Predicates = [HasAVX512, OptForSize] in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
@@ -3999,11 +3898,18 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
(_.VT _.RC:$src0))))],
_.ExeDomain>, EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>;
- let canFoldAsLoad = 1, isReMaterializable = 1 in
- def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+ let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+ [(set _.RC:$dst, (_.VT (vzload_frag addr:$src)))],
_.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
+ // _alt version uses FR32/FR64 register class.
+ let isCodeGenOnly = 1 in
+ def rm_alt : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+ _.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
+ }
let mayLoad = 1, hasSideEffects = 0 in {
let Constraints = "$src0 = $dst" in
def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
@@ -4023,16 +3929,16 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
EVEX, Sched<[WriteFStore]>;
let mayStore = 1, hasSideEffects = 0 in
def mrk: AVX512PI<0x11, MRMDestMem, (outs),
- (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
+ (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.RC:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
[], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>,
NotMemoryFoldable;
}
-defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
-defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -4070,7 +3976,7 @@ def : Pat<(masked_store
(iPTR 0))), addr:$dst, Mask),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
- (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+ _.info128.RC:$src)>;
}
@@ -4085,7 +3991,7 @@ def : Pat<(masked_store
(iPTR 0))), addr:$dst, Mask),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
- (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+ _.info128.RC:$src)>;
}
@@ -4105,13 +4011,13 @@ def : Pat<(masked_store
(iPTR 0))), addr:$dst, Mask512),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
- (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+ _.info128.RC:$src)>;
// AVX512VL pattern.
def : Pat<(masked_store (_.info128.VT _.info128.RC:$src), addr:$dst, Mask128),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
- (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+ _.info128.RC:$src)>;
}
multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
@@ -4119,8 +4025,7 @@ multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
- (_.info512.VT (bitconvert
- (v16i32 immAllZerosV))))),
+ _.info512.ImmAllZerosV)),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
(COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
@@ -4145,8 +4050,7 @@ multiclass avx512_load_scalar_lowering_subreg<string InstrStr,
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
- (_.info512.VT (bitconvert
- (v16i32 immAllZerosV))))),
+ _.info512.ImmAllZerosV)),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
@@ -4175,8 +4079,7 @@ multiclass avx512_load_scalar_lowering_subreg2<string InstrStr,
// AVX512F patterns.
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask512,
- (_.info512.VT (bitconvert
- (v16i32 immAllZerosV))))),
+ _.info512.ImmAllZerosV)),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
@@ -4194,7 +4097,7 @@ def : Pat<(_.info128.VT (extract_subvector
// AVX512VL patterns.
def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
- (_.info128.VT (bitconvert (v4i32 immAllZerosV))))),
+ _.info128.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#rmkz)
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
addr:$srcAddr)>;
@@ -4383,15 +4286,6 @@ let Predicates = [HasAVX512, OptForSize] in {
(v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
@@ -4400,17 +4294,6 @@ let Predicates = [HasAVX512, OptForSize] in {
(SUBREG_TO_REG (i32 0),
(v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
-
- def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
- def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
}
// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
@@ -4426,79 +4309,27 @@ let Predicates = [HasAVX512, OptForSpeed] in {
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
(i8 3))), sub_xmm)>;
-
- def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
- (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)),
- (i8 1))), sub_xmm)>;
- def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
- (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)),
- (i8 0xf))), sub_xmm)>;
}
let Predicates = [HasAVX512] in {
-
- // MOVSSrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
- def : Pat<(v4f32 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
-
- // MOVSDrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
- def : Pat<(v2f64 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (VMOVSSZrm addr:$src)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (VMOVSDZrm addr:$src)>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v8f32 (X86vzload addr:$src)),
+ def : Pat<(v8f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzload addr:$src)),
+ def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
// Represent the same patterns above but in the form they appear for
// 512-bit types
- def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+ def : Pat<(v16f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v16f32 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
- def : Pat<(v8f64 (X86vzload addr:$src)),
+ def : Pat<(v8f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
-
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
-
- // Extract and store.
- def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
- addr:$dst),
- (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
}
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
@@ -4517,47 +4348,47 @@ let Predicates = [HasAVX512] in {
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
(VMOV64toPQIZrr GR64:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
- def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v8i32 (X86vzload addr:$src)),
+ def : Pat<(v8i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVQI2PQIZrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
(VMOVZPQILo2PQIZrr VR128X:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)),
+ def : Pat<(v2i64 (X86vzload64 addr:$src)),
(VMOVQI2PQIZrm addr:$src)>;
- def : Pat<(v4i64 (X86vzload addr:$src)),
+ def : Pat<(v4i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
- // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
- def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
- def : Pat<(v16i32 (X86vzload addr:$src)),
+ def : Pat<(v16i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v8i64 (X86vzload addr:$src)),
+ def : Pat<(v8i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
+
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIZrr
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIZrr
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))),
+ sub_xmm)>;
+
+ def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIZrr
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIZrr
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))),
+ sub_xmm)>;
}
//===----------------------------------------------------------------------===//
@@ -4686,7 +4517,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- IsCommutable>, AVX512BIBase, EVEX_4V,
+ IsCommutable, IsCommutable>, AVX512BIBase, EVEX_4V,
Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4922,7 +4753,7 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
- IsCommutable>,
+ IsCommutable, IsCommutable>,
EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
@@ -5458,16 +5289,14 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (VecNode _.RC:$src1, _.RC:$src2,
- (i32 FROUND_CURRENT)))>,
+ (_.VT (VecNode _.RC:$src1, _.RC:$src2))>,
Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
- _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT)))>,
+ _.ScalarIntMemCPat:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -5495,7 +5324,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$rc)), IsCommutable>,
+ (i32 timm:$rc))>,
EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
@@ -5534,23 +5363,22 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC))>, EVEX_B,
- Sched<[sched]>;
+ (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_B, Sched<[sched]>;
}
}
multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode VecNode, X86SchedWriteSizes sched,
- bit IsCommutable> {
+ SDNode VecNode, SDNode RndNode,
+ X86SchedWriteSizes sched, bit IsCommutable> {
defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
sched.PS.Scl, IsCommutable>,
- avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode,
+ avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, RndNode,
sched.PS.Scl, IsCommutable>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
sched.PD.Scl, IsCommutable>,
- avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode,
+ avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, RndNode,
sched.PD.Scl, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
@@ -5565,17 +5393,17 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
VecNode, SaeNode, sched.PD.Scl, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
-defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds,
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds,
SchedWriteFAddSizes, 1>;
-defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds,
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmuls, X86fmulRnds,
SchedWriteFMulSizes, 1>;
-defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds,
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubs, X86fsubRnds,
SchedWriteFAddSizes, 0>;
-defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds,
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivs, X86fdivRnds,
SchedWriteFDivSizes, 0>;
-defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
+defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs,
SchedWriteFCmpSizes, 0>;
-defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
+defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs,
SchedWriteFCmpSizes, 0>;
// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
@@ -5618,13 +5446,13 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable,
- bit IsKZCommutable = IsCommutable> {
+ bit IsKCommutable = IsCommutable> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable, 0,
- IsKZCommutable>,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable,
+ IsKCommutable, IsKCommutable>,
EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in {
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -5651,18 +5479,18 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>,
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc)))>,
EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNodeRnd,
+ SDPatternOperator OpNodeSAE,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
+ (_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>,
EVEX_4V, EVEX_B, Sched<[sched]>;
}
@@ -5731,10 +5559,10 @@ defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>;
defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
SchedWriteFCmpSizes, 0>,
- avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SchedWriteFCmpSizes>;
+ avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>;
defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
SchedWriteFCmpSizes, 0>,
- avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SchedWriteFCmpSizes>;
+ avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>;
let isCodeGenOnly = 1 in {
defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
SchedWriteFCmpSizes, 1>;
@@ -5750,71 +5578,25 @@ defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
-let Predicates = [HasVLX,HasDQI] in {
- // Use packed logical operations for scalar ops.
- def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VANDPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
- (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
- FR64X)>;
- def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
- (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
- FR64X)>;
- def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VXORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
- (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
- FR64X)>;
- def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VANDNPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
- (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
- FR64X)>;
-
- def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VANDPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
- FR32X)>;
- def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
- FR32X)>;
- def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VXORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
- FR32X)>;
- def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VANDNPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
- FR32X)>;
-}
-
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
EVEX_4V, Sched<[sched]>;
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>,
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))),
- (i32 FROUND_CURRENT))>,
+ (_.ScalarLdFrag addr:$src2))))>,
EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5825,332 +5607,139 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
Sched<[sched]>;
defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT))>,
+ (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeScal,
X86SchedWriteWidths sched> {
- defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>,
- avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>,
+ defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v16f32_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v16f32_info>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>,
- avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>,
+ defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v8f64_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
- defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f32x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, sched.Scl>,
- EVEX_4V,EVEX_CD8<32, CD8VT1>;
- defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f64x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, sched.Scl>,
- EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+ defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info,
+ X86scalefsRnd, sched.Scl>,
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info,
+ X86scalefsRnd, sched.Scl>,
+ EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v4f32x_info>,
+ defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v4f32x_info>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v8f32x_info>,
+ defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v8f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v2f64x_info>,
+ defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v2f64x_info>,
EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v4f64x_info>,
+ defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v4f64x_info>,
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
-defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs,
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef",
SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible;
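// Illustrative note on the resulting record names (derived from the defm
// prefixes above, not stated in the patch itself): with NAME = VSCALEF, the
// nested defms concatenate into instructions such as VSCALEFPSZrr,
// VSCALEFPSZrmb and VSCALEFSSZrr, plus the masked "k"/"kz" variants produced
// by the AVX512_maskable helpers.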
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
-multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
string Name> {
- let ExeDomain = _.ExeDomain in {
- let isCommutable = 1 in
+ // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG;
+ // there are just too many permutations due to commutability and bitcasts
+ // (an illustrative sketch follows this multiclass).
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)>,
+ (null_frag), (null_frag), 1>,
EVEX_4V, Sched<[sched]>;
+ let mayLoad = 1 in
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (and _.RC:$src1, (_.LdFrag addr:$src2)),
- _.ImmAllZerosV)>,
+ (null_frag), (null_frag)>,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-
- // Patterns for compare with 0 that just use the same source twice.
- def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
- (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rr")
- _.RC:$src, _.RC:$src))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
- (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rrk")
- _.KRC:$mask, _.RC:$src, _.RC:$src))>;
}
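// Illustrative sketch of the pattern explosion the NOTE above refers to,
// reconstructed from the patterns this patch removes: with the old PatFrag
// expansion (setcc ... SETNE / SETEQ), each element width needed forms for
// plain, masked, folded-load, broadcast and bitcast inputs, for example
//   (_.KVT (setcc (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV, SETNE))
//   (_.KVT (and _.KRC:$mask,
//           (setcc (bitconvert (AndInfo.VT (and _.RC:$src1,
//                                           (AndInfo.LdFrag addr:$src2)))),
//                  _.ImmAllZerosV, SETNE)))
// multiplied by every AndInfo element width and by operand commutations,
// which is why selection now happens in X86ISelDAGToDAG instead.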
-multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in
+ let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (OpNode (and _.RC:$src1,
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))),
- _.ImmAllZerosV)>,
+ (null_frag), (null_frag)>,
EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-// Use 512bit version to implement 128/256 bit in case NoVLX.
-multiclass avx512_vptest_lowering<PatFrag OpNode, X86VectorVTInfo ExtendInfo,
- X86VectorVTInfo _, string Name> {
- def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2),
- _.ImmAllZerosV)),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode (and _.RC:$src1, _.RC:$src2),
- _.ImmAllZerosV))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrrk")
- (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC)>;
-
- def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx)),
- _.KRC))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrrk")
- (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx)),
- _.KRC)>;
-}
-
-multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
+multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_vptest<opc, OpcodeStr, OpNode, sched.ZMM, _.info512, NAME>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, EVEX_V512;
+ defm Z : avx512_vptest<opc, OpcodeStr, sched.ZMM, _.info512, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, sched.YMM, _.info256, NAME>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, EVEX_V256;
- defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, sched.XMM, _.info128, NAME>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, EVEX_V128;
- }
- let Predicates = [HasAVX512, NoVLX] in {
- defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, NAME>;
- defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, NAME>;
+ defm Z256 : avx512_vptest<opc, OpcodeStr, sched.YMM, _.info256, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vptest<opc, OpcodeStr, sched.XMM, _.info128, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.XMM, _.info128>, EVEX_V128;
}
}
-multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
- defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, sched,
+ defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", sched,
avx512vl_i32_info>;
- defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, sched,
+ defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", sched,
avx512vl_i64_info>, VEX_W;
}
multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
- PatFrag OpNode, X86SchedWriteWidths sched> {
+ X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in {
- defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.ZMM,
+ defm WZ: avx512_vptest<opc, OpcodeStr#"w", sched.ZMM,
v32i16_info, NAME#"W">, EVEX_V512, VEX_W;
- defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.ZMM,
+ defm BZ: avx512_vptest<opc, OpcodeStr#"b", sched.ZMM,
v64i8_info, NAME#"B">, EVEX_V512;
}
let Predicates = [HasVLX, HasBWI] in {
- defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.YMM,
+ defm WZ256: avx512_vptest<opc, OpcodeStr#"w", sched.YMM,
v16i16x_info, NAME#"W">, EVEX_V256, VEX_W;
- defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.XMM,
+ defm WZ128: avx512_vptest<opc, OpcodeStr#"w", sched.XMM,
v8i16x_info, NAME#"W">, EVEX_V128, VEX_W;
- defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.YMM,
+ defm BZ256: avx512_vptest<opc, OpcodeStr#"b", sched.YMM,
v32i8x_info, NAME#"B">, EVEX_V256;
- defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.XMM,
+ defm BZ128: avx512_vptest<opc, OpcodeStr#"b", sched.XMM,
v16i8x_info, NAME#"B">, EVEX_V128;
}
-
- let Predicates = [HasBWI, NoVLX] in {
- defm BZ256_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v32i8x_info, NAME#"B">;
- defm BZ128_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v16i8x_info, NAME#"B">;
- defm WZ256_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v16i16x_info, NAME#"W">;
- defm WZ128_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v8i16x_info, NAME#"W">;
- }
}
-// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm
-// as commutable here because we already canonicalized all zeros vectors to the
-// RHS during lowering.
-def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
- (setcc node:$src1, node:$src2, SETEQ)>;
-def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
- (setcc node:$src1, node:$src2, SETNE)>;
-
multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
- PatFrag OpNode, X86SchedWriteWidths sched> :
- avx512_vptest_wb<opc_wb, OpcodeStr, OpNode, sched>,
- avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, sched>;
+ X86SchedWriteWidths sched> :
+ avx512_vptest_wb<opc_wb, OpcodeStr, sched>,
+ avx512_vptest_dq<opc_dq, OpcodeStr, sched>;
-defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem,
+defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm",
SchedWriteVecLogic>, T8PD;
-defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm,
+defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm",
SchedWriteVecLogic>, T8XS;
-
-multiclass avx512_vptest_lowering_pats<string InstrStr, PatFrag OpNode,
- X86VectorVTInfo _,
- X86VectorVTInfo AndInfo> {
- def : Pat<(_.KVT (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV))),
- (!cast<Instruction>(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1,
- _.RC:$src2)>;
-
- def : Pat<(_.KVT (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1,
- (AndInfo.LdFrag addr:$src2)))),
- _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr # "rm") _.RC:$src1, addr:$src2)>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1,
- (AndInfo.LdFrag addr:$src2)))),
- _.ImmAllZerosV))),
- (!cast<Instruction>(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1,
- addr:$src2)>;
-}
-
-// Patterns to use 512-bit instructions when 128/256 are not available.
-multiclass avx512_vptest_lowering_wide_pats<string InstrStr, PatFrag OpNode,
- X86VectorVTInfo _,
- X86VectorVTInfo AndInfo,
- X86VectorVTInfo ExtendInfo> {
- def : Pat<(_.KVT (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV)),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(InstrStr#"rr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(InstrStr#"rrk")
- (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC)>;
-}
-
-multiclass avx512_vptest_lowering_sizes<string InstrStr, PatFrag OpNode,
- Predicate prd,
- AVX512VLVectorVTInfo CmpInfo,
- AVX512VLVectorVTInfo AndInfo> {
-let Predicates = [prd, HasVLX] in {
- defm : avx512_vptest_lowering_pats<InstrStr#"Z128", OpNode,
- CmpInfo.info128, AndInfo.info128>;
- defm : avx512_vptest_lowering_pats<InstrStr#"Z256", OpNode,
- CmpInfo.info256, AndInfo.info256>;
-}
-let Predicates = [prd] in {
- defm : avx512_vptest_lowering_pats<InstrStr#"Z", OpNode,
- CmpInfo.info512, AndInfo.info512>;
-}
-
-let Predicates = [prd, NoVLX] in {
- defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode,
- CmpInfo.info128, AndInfo.info128,
- CmpInfo.info512>;
- defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode,
- CmpInfo.info256, AndInfo.info256,
- CmpInfo.info512>;
-}
-}
-
-multiclass avx512_vptest_lowering_types<string InstrStr, PatFrag OpNode> {
- defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
- avx512vl_i8_info, avx512vl_i16_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
- avx512vl_i8_info, avx512vl_i32_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
- avx512vl_i8_info, avx512vl_i64_info>;
-
- defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
- avx512vl_i16_info, avx512vl_i8_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
- avx512vl_i16_info, avx512vl_i32_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
- avx512vl_i16_info, avx512vl_i64_info>;
-
- defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
- avx512vl_i32_info, avx512vl_i8_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
- avx512vl_i32_info, avx512vl_i16_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
- avx512vl_i32_info, avx512vl_i64_info>;
-
- defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
- avx512vl_i64_info, avx512vl_i8_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
- avx512vl_i64_info, avx512vl_i16_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
- avx512vl_i64_info, avx512vl_i32_info>;
-}
-
-defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem>;
-defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm>;
-
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
//===----------------------------------------------------------------------===//
@@ -6427,86 +6016,23 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
}
}
-defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SchedWriteVarVecShift>,
- avx512_var_shift_w<0x12, "vpsllvw", shl, SchedWriteVarVecShift>;
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", X86vshlv, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x12, "vpsllvw", X86vshlv, SchedWriteVarVecShift>;
-defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SchedWriteVarVecShift>,
- avx512_var_shift_w<0x11, "vpsravw", sra, SchedWriteVarVecShift>;
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", X86vsrav, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x11, "vpsravw", X86vsrav, SchedWriteVarVecShift>;
-defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SchedWriteVarVecShift>,
- avx512_var_shift_w<0x10, "vpsrlvw", srl, SchedWriteVarVecShift>;
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x10, "vpsrlvw", X86vsrlv, SchedWriteVarVecShift>;
defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
-defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", sra, [HasBWI, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", srl, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>;
-// Special handing for handling VPSRAV intrinsics.
-multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
- list<Predicate> p> {
- let Predicates = p in {
- def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
- (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1,
- _.RC:$src2)>;
- def : Pat<(_.VT (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2))),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
- _.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
- (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
- _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
- _.RC:$src0)),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
- _.KRC:$mask, _.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
- _.RC:$src1, _.RC:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
- _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
- _.RC:$src1, addr:$src2)>;
- }
-}
-
-multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
- list<Predicate> p> :
- avx512_var_shift_int_lowering<InstrStr, _, p> {
- let Predicates = p in {
- def : Pat<(_.VT (X86vsrav _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
- _.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
- _.RC:$src0)),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
- _.KRC:$mask, _.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
- _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask,
- _.RC:$src1, addr:$src2)>;
- }
-}
-
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v8i16x_info, [HasVLX, HasBWI]>;
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v16i16x_info, [HasVLX, HasBWI]>;
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v32i16_info, [HasBWI]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v4i32x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v8i32x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v16i32_info, [HasAVX512]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
// Use the 512-bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in the NoVLX case.
let Predicates = [HasAVX512, NoVLX] in {
@@ -6827,17 +6353,20 @@ let Predicates = [HasAVX512] in {
def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))),
+ (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+
+ // VMOVLPD patterns
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload64 addr:$src2))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
}
let SchedRW = [WriteFStore] in {
+let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt
- (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
- (bc_v2f64 (v4f32 VR128X:$src))),
- (iPTR 0))), addr:$dst)]>,
- EVEX, EVEX_CD8<32, CD8VT2>;
+ []>, EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhpd\t{$src, $dst|$dst, $src}",
@@ -6845,12 +6374,11 @@ def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
(v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
(iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)),
- (iPTR 0))), addr:$dst)]>,
- EVEX, EVEX_CD8<32, CD8VT2>;
+ []>, EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlpd\t{$src, $dst|$dst, $src}",
@@ -6903,7 +6431,7 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
@@ -6978,7 +6506,7 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))),
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
1, 1, vselect, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
@@ -7056,7 +6584,7 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))),
+ (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
1, 1, vselect, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
@@ -7132,7 +6660,7 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ "\t{$rc, $src3, $src2, $dst|$dst, $src2, $src3, $rc}"),
!if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC,
Sched<[SchedWriteFMA.Scl]>;
}// isCodeGenOnly = 1
@@ -7151,7 +6679,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
(_.ScalarLdFrag addr:$src3)))),
(set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src1,
- _.FRC:$src3, (i32 imm:$rc)))), 0>;
+ _.FRC:$src3, (i32 timm:$rc)))), 0>;
defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
@@ -7159,7 +6687,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
(_.ScalarLdFrag addr:$src3), _.FRC:$src1))),
(set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src3,
- _.FRC:$src1, (i32 imm:$rc)))), 1>;
+ _.FRC:$src1, (i32 timm:$rc)))), 1>;
// One pattern is in 312 order so that the load is in a different place from the
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
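// Illustrative summary of the three memory patterns (simplified from the
// defms above and below; operand classes elided):
//   213: (OpNode $src2, $src1, (load addr:$src3))
//   231: (OpNode $src2, (load addr:$src3), $src1)
//   132, written in 312 order: (OpNode (load addr:$src3), $src1, $src2)
// Keeping the load in a distinct operand position is what lets TableGen's
// duplicate pattern detection tell the three apart.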
@@ -7169,7 +6697,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3),
_.FRC:$src1, _.FRC:$src2))),
(set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src1, _.FRC:$src3,
- _.FRC:$src2, (i32 imm:$rc)))), 1>;
+ _.FRC:$src2, (i32 timm:$rc)))), 1>;
}
}
@@ -7333,62 +6861,62 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(RndOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src3, (i32 imm:$rc)))))),
+ _.FRC:$src3, (i32 timm:$rc)))))),
(!cast<I>(Prefix#"213"#Suffix#"Zrb_Int")
VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(RndOp _.FRC:$src2, _.FRC:$src3,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (i32 imm:$rc)))))),
+ (i32 timm:$rc)))))),
(!cast<I>(Prefix#"231"#Suffix#"Zrb_Int")
VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(X86selects VK1WM:$mask,
(RndOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src3, (i32 imm:$rc)),
+ _.FRC:$src3, (i32 timm:$rc)),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
(!cast<I>(Prefix#"213"#Suffix#"Zrb_Intk")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(X86selects VK1WM:$mask,
(RndOp _.FRC:$src2, _.FRC:$src3,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (i32 imm:$rc)),
+ (i32 timm:$rc)),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
(!cast<I>(Prefix#"231"#Suffix#"Zrb_Intk")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(X86selects VK1WM:$mask,
(RndOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src3, (i32 imm:$rc)),
+ _.FRC:$src3, (i32 timm:$rc)),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"213"#Suffix#"Zrb_Intkz")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(X86selects VK1WM:$mask,
(RndOp _.FRC:$src2, _.FRC:$src3,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (i32 imm:$rc)),
+ (i32 timm:$rc)),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"231"#Suffix#"Zrb_Intkz")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
}
}
@@ -7468,44 +6996,44 @@ defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
// AVX-512 Scalar convert from sign integer to float/double
//===----------------------------------------------------------------------===//
-multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, X86FoldableSchedWrite sched,
+multiclass avx512_vcvtsi<bits<8> opc, SDPatternOperator OpNode, X86FoldableSchedWrite sched,
RegisterClass SrcRC, X86VectorVTInfo DstVT,
- X86MemOperand x86memop, PatFrag ld_frag, string asm> {
- let hasSideEffects = 0 in {
+ X86MemOperand x86memop, PatFrag ld_frag, string asm,
+ string mem> {
+ let hasSideEffects = 0, isCodeGenOnly = 1 in {
def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- EVEX_4V, Sched<[sched]>;
+ EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, x86memop:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
- let isCodeGenOnly = 1 in {
- def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
- (ins DstVT.RC:$src1, SrcRC:$src2),
- !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set DstVT.RC:$dst,
- (OpNode (DstVT.VT DstVT.RC:$src1),
- SrcRC:$src2,
- (i32 FROUND_CURRENT)))]>,
- EVEX_4V, Sched<[sched]>;
-
- def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
- (ins DstVT.RC:$src1, x86memop:$src2),
- !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set DstVT.RC:$dst,
- (OpNode (DstVT.VT DstVT.RC:$src1),
- (ld_frag addr:$src2),
- (i32 FROUND_CURRENT)))]>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
- }//isCodeGenOnly = 1
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, SrcRC:$src2),
+ !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2))]>,
+ EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, x86memop:$src2),
+ asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1),
+ (ld_frag addr:$src2)))]>,
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>(NAME#"rr_Int") DstVT.RC:$dst,
+ DstVT.RC:$src1, SrcRC:$src2), 0, "att">;
}
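// Illustrative usage of the "att" alias above (operands are arbitrary
// examples): for the VCVTSI2SSZ instantiation below, asm = "cvtsi2ss" and
// mem = "l", so the assembler additionally accepts the explicitly suffixed
// register form
//   vcvtsi2ssl %eax, %xmm1, %xmm0
// and maps it to the rr_Int record (VCVTSI2SSZrr_Int).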
multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode,
X86FoldableSchedWrite sched, RegisterClass SrcRC,
- X86VectorVTInfo DstVT, string asm> {
+ X86VectorVTInfo DstVT, string asm,
+ string mem> {
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
!strconcat(asm,
@@ -7513,37 +7041,44 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
- (i32 imm:$rc)))]>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+ (i32 timm:$rc)))]>,
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ def : InstAlias<"v"#asm#mem#"\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}",
+ (!cast<Instruction>(NAME#"rrb_Int") DstVT.RC:$dst,
+ DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), 0, "att">;
}
-multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode,
+multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, SDNode OpNodeRnd,
X86FoldableSchedWrite sched,
RegisterClass SrcRC, X86VectorVTInfo DstVT,
- X86MemOperand x86memop, PatFrag ld_frag, string asm> {
- defm NAME : avx512_vcvtsi_round<opc, OpNode, sched, SrcRC, DstVT, asm>,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, string mem> {
+ defm NAME : avx512_vcvtsi_round<opc, OpNodeRnd, sched, SrcRC, DstVT, asm, mem>,
avx512_vcvtsi<opc, OpNode, sched, SrcRC, DstVT, x86memop,
- ld_frag, asm>, VEX_LIG;
+ ld_frag, asm, mem>, VEX_LIG;
}
let Predicates = [HasAVX512] in {
-defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR32,
- v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
+defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+ WriteCvtI2SS, GR32,
+ v4f32x_info, i32mem, loadi32, "cvtsi2ss", "l">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR64,
- v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+ WriteCvtI2SS, GR64,
+ v4f32x_info, i64mem, loadi64, "cvtsi2ss", "q">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR32,
- v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
- XD, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR64,
- v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
+defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, null_frag, WriteCvtI2SD, GR32,
+ v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l">,
+ XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+ WriteCvtI2SD, GR64,
+ v2f64x_info, i64mem, loadi64, "cvtsi2sd", "q">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
+ (VCVTSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
+ (VCVTSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -7563,23 +7098,26 @@ def : Pat<(f64 (sint_to_fp GR32:$src)),
def : Pat<(f64 (sint_to_fp GR64:$src)),
(VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
-defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR32,
+defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
+ WriteCvtI2SS, GR32,
v4f32x_info, i32mem, loadi32,
- "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR64,
- v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
+ "cvtusi2ss", "l">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
+ WriteCvtI2SS, GR64,
+ v4f32x_info, i64mem, loadi64, "cvtusi2ss", "q">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR32, v2f64x_info,
- i32mem, loadi32, "cvtusi2sd{l}">,
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, null_frag, WriteCvtI2SD, GR32, v2f64x_info,
+ i32mem, loadi32, "cvtusi2sd", "l">,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR64,
- v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
+ WriteCvtI2SD, GR64,
+ v2f64x_info, i64mem, loadi64, "cvtusi2sd", "q">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
+ (VCVTUSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
+ (VCVTUSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
(VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -7608,8 +7146,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
X86VectorVTInfo DstVT, SDNode OpNode,
SDNode OpNodeRnd,
X86FoldableSchedWrite sched, string asm,
- string aliasStr,
- bit CodeGenOnly = 1> {
+ string aliasStr> {
let Predicates = [HasAVX512] in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
@@ -7617,34 +7154,23 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
EVEX, VEX_LIG, Sched<[sched]>;
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
!strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
- [(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
+ [(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 timm:$rc)))]>,
EVEX, VEX_LIG, EVEX_B, EVEX_RC,
Sched<[sched]>;
- let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
(SrcVT.VT SrcVT.ScalarIntMemCPat:$src)))]>,
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
-
- def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">;
- def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}",
- (!cast<Instruction>(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">;
} // Predicates = [HasAVX512]
-}
-multiclass avx512_cvt_s_int_round_aliases<bits<8> opc, X86VectorVTInfo SrcVT,
- X86VectorVTInfo DstVT, SDNode OpNode,
- SDNode OpNodeRnd,
- X86FoldableSchedWrite sched, string asm,
- string aliasStr> :
- avx512_cvt_s_int_round<opc, SrcVT, DstVT, OpNode, OpNodeRnd, sched, asm, aliasStr, 0> {
- let Predicates = [HasAVX512] in {
- def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "rm_Int") DstVT.RC:$dst,
- SrcVT.IntScalarMemOp:$src), 0, "att">;
- } // Predicates = [HasAVX512]
+ def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">;
+ def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}",
+ (!cast<Instruction>(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">;
+ def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rm_Int") DstVT.RC:$dst,
+ SrcVT.IntScalarMemOp:$src), 0, "att">;
}
// Convert float/double to signed/unsigned int 32/64
@@ -7654,10 +7180,10 @@ defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,X86cvts2si,
defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info, X86cvts2si,
X86cvts2siRnd, WriteCvtSS2I, "cvtss2si", "{q}">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USIZ: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i32x_info, X86cvts2usi,
+defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i64x_info, X86cvts2usi,
+defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{q}">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si,
@@ -7666,10 +7192,10 @@ defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si,
defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info, X86cvts2si,
X86cvts2siRnd, WriteCvtSD2I, "cvtsd2si", "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USIZ: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i32x_info, X86cvts2usi,
+defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i64x_info, X86cvts2usi,
+defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -7760,19 +7286,18 @@ def : Pat<(v2f64 (X86Movsd
// Convert float/double to signed/unsigned int 32/64 with truncation
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDNode OpNode,
- SDNode OpNodeInt, SDNode OpNodeRnd,
- X86FoldableSchedWrite sched, string aliasStr,
- bit CodeGenOnly = 1>{
+ SDNode OpNodeInt, SDNode OpNodeSAE,
+ X86FoldableSchedWrite sched, string aliasStr>{
let Predicates = [HasAVX512] in {
let isCodeGenOnly = 1 in {
def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>,
- EVEX, Sched<[sched]>;
+ EVEX, VEX_LIG, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
- EVEX, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
@@ -7781,63 +7306,49 @@ let Predicates = [HasAVX512] in {
EVEX, VEX_LIG, Sched<[sched]>;
def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
!strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
- [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
- (i32 FROUND_NO_EXC)))]>,
- EVEX,VEX_LIG , EVEX_B, Sched<[sched]>;
- let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in
+ [(set _DstRC.RC:$dst, (OpNodeSAE (_SrcRC.VT _SrcRC.RC:$src)))]>,
+ EVEX, VEX_LIG, EVEX_B, Sched<[sched]>;
def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
(ins _SrcRC.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst,
(OpNodeInt (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src)))]>,
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+} //HasAVX512
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "rr_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
def : InstAlias<asm # aliasStr # "\t{{sae}, $src, $dst|$dst, $src, {sae}}",
(!cast<Instruction>(NAME # "rrb_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
-} //HasAVX512
-}
-
-multiclass avx512_cvt_s_all_unsigned<bits<8> opc, string asm,
- X86VectorVTInfo _SrcRC,
- X86VectorVTInfo _DstRC, SDNode OpNode,
- SDNode OpNodeInt, SDNode OpNodeRnd,
- X86FoldableSchedWrite sched,
- string aliasStr> :
- avx512_cvt_s_all<opc, asm, _SrcRC, _DstRC, OpNode, OpNodeInt, OpNodeRnd, sched,
- aliasStr, 0> {
-let Predicates = [HasAVX512] in {
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "rm_Int") _DstRC.RC:$dst,
_SrcRC.IntScalarMemOp:$src), 0, "att">;
}
-}
defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSS2I,
+ fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
"{l}">, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSS2I,
+ fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
"{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSD2I,
+ fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
"{l}">, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSD2I,
+ fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
"{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSS2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i32x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSS2I,
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
+ fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
"{l}">, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i64x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSS2I,
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
+ fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
"{q}">, XS,VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i32x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSD2I,
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
+ fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
"{l}">, XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i64x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSD2I,
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
+ fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
"{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
//===----------------------------------------------------------------------===//
@@ -7851,15 +7362,13 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
- (_Src.VT _Src.RC:$src2),
- (i32 FROUND_CURRENT)))>,
+ (_Src.VT _Src.RC:$src2)))>,
EVEX_4V, VEX_LIG, Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
- (_Src.VT _Src.ScalarIntMemCPat:$src2),
- (i32 FROUND_CURRENT)))>,
+ (_Src.VT _Src.ScalarIntMemCPat:$src2)))>,
EVEX_4V, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -7878,14 +7387,13 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
// Scalar Conversion with SAE - suppress all exceptions
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86VectorVTInfo _Src, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (_.VT (OpNodeRnd (_.VT _.RC:$src1),
- (_Src.VT _Src.RC:$src2),
- (i32 FROUND_NO_EXC)))>,
+ (_.VT (OpNodeSAE (_.VT _.RC:$src1),
+ (_Src.VT _Src.RC:$src2)))>,
EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
}
@@ -7897,34 +7405,36 @@ multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInf
(ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
- (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
+ (_Src.VT _Src.RC:$src2), (i32 timm:$rc)))>,
EVEX_4V, VEX_LIG, Sched<[sched]>,
EVEX_B, EVEX_RC;
}
multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
- SDNode OpNodeRnd, X86FoldableSchedWrite sched,
- X86VectorVTInfo _src, X86VectorVTInfo _dst> {
+ SDNode OpNode, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _src, X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
}
}
-multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeSAE,
X86FoldableSchedWrite sched,
X86VectorVTInfo _src, X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
- avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
+ avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeSAE, sched>,
EVEX_CD8<32, CD8VT1>, XS;
}
}
-defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss",
- X86froundRnd, WriteCvtSD2SS, f64x_info,
+defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86frounds,
+ X86froundsRnd, WriteCvtSD2SS, f64x_info,
f32x_info>;
-defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
- X86fpextRnd, WriteCvtSS2SD, f32x_info,
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpexts,
+ X86fpextsSAE, WriteCvtSS2SD, f32x_info,
f64x_info>;
def : Pat<(f64 (fpextend FR32X:$src)),
@@ -7934,14 +7444,6 @@ def : Pat<(f64 (fpextend (loadf32 addr:$src))),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX512, OptForSize]>;
-def : Pat<(f64 (extloadf32 addr:$src)),
- (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX512, OptForSize]>;
-
-def : Pat<(f64 (extloadf32 addr:$src)),
- (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
- Requires<[HasAVX512, OptForSpeed]>;
-
def : Pat<(f32 (fpround FR64X:$src)),
(VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>,
Requires<[HasAVX512]>;
@@ -7970,7 +7472,8 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86FoldableSchedWrite sched,
string Broadcast = _.BroadcastStr,
string Alias = "", X86MemOperand MemOp = _Src.MemOp,
- RegisterClass MaskRC = _.KRCWM> {
+ RegisterClass MaskRC = _.KRCWM,
+ dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src))))> {
defm rr : AVX512_maskable_common<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src),
@@ -7989,12 +7492,8 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.RC:$src0, MaskRC:$mask, MemOp:$src),
(ins MaskRC:$mask, MemOp:$src),
OpcodeStr#Alias, "$src", "$src",
- (_.VT (OpNode (_Src.VT
- (_Src.LdFrag addr:$src)))),
- (vselect MaskRC:$mask,
- (_.VT (OpNode (_Src.VT
- (_Src.LdFrag addr:$src)))),
- _.RC:$src0),
+ LdDAG,
+ (vselect MaskRC:$mask, LdDAG, _.RC:$src0),
vselect, "$src0 = $dst">,
EVEX, Sched<[sched.Folded]>;
@@ -8019,13 +7518,12 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
}
// Conversion with SAE - suppress all exceptions
multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86VectorVTInfo _Src, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
- (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src),
- (i32 FROUND_NO_EXC)))>,
+ (_.VT (OpNodeSAE (_Src.VT _Src.RC:$src)))>,
EVEX, EVEX_B, Sched<[sched]>;
}
@@ -8036,23 +7534,34 @@ multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
"$rc, $src", "$src, $rc",
- (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>,
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 timm:$rc)))>,
EVEX, EVEX_B, EVEX_RC, Sched<[sched]>;
}
+// Similar to avx512_vcvt_fp, but uses an extload for the memory form.
+multiclass avx512_vcvt_fpextend<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ string Broadcast = _.BroadcastStr,
+ string Alias = "", X86MemOperand MemOp = _Src.MemOp,
+ RegisterClass MaskRC = _.KRCWM>
+ : avx512_vcvt_fp<opc, OpcodeStr, _, _Src, OpNode, sched, Broadcast, Alias,
+ MemOp, MaskRC,
+ (_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src))>;
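+// For example, in the v8f64/v8f32 instantiation below, LdDAG evaluates to
+// (v8f64 (extloadv8f32 addr:$src)), taking over the role of the standalone
+// extload patterns removed later in this patch.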
+
// Extend Float to Double
multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info,
+ defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f32x_info,
fpextend, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
- X86vfpextRnd, sched.ZMM>, EVEX_V512;
+ X86vfpextSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
+ defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v4f32x_info,
X86vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend,
+ defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend,
sched.YMM>, EVEX_V256;
}
}
@@ -8060,7 +7569,7 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
// Truncate Double to Float
multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround, sched.ZMM>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, X86vfpround, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
X86vfproundRnd, sched.ZMM>, EVEX_V512;
}
@@ -8068,18 +7577,49 @@ multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sc
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>,
EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround,
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, X86vfpround,
sched.YMM, "{1to4}", "{y}">, EVEX_V256;
-
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">;
}
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
}
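// Illustrative AT&T forms accepted through the aliases above for
// OpcodeStr = "vcvtpd2ps" (register and mask choices are arbitrary examples):
//   vcvtpd2psx %xmm1, %xmm0
//   vcvtpd2psy %ymm2, %xmm0 {%k2}
//   vcvtpd2psx (%rax){1to2}, %xmm0 {%k1} {z}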
defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>,
@@ -8087,20 +7627,66 @@ defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>,
defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>,
PS, EVEX_CD8<32, CD8VH>;
-def : Pat<(v8f64 (extloadv8f32 addr:$src)),
- (VCVTPS2PDZrm addr:$src)>;
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8f32 (fpround (v8f64 VR512:$src))),
+ (VCVTPD2PSZrr VR512:$src)>;
+ def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))),
+ VR256X:$src0),
+ (VCVTPD2PSZrrk VR256X:$src0, VK8WM:$mask, VR512:$src)>;
+ def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))),
+ v8f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZrrkz VK8WM:$mask, VR512:$src)>;
+
+ def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))),
+ (VCVTPD2PSZrm addr:$src)>;
+ def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))),
+ VR256X:$src0),
+ (VCVTPD2PSZrmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+ def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))),
+ v8f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>;
+
+ def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))),
+ (VCVTPD2PSZrmb addr:$src)>;
+ def : Pat<(vselect VK8WM:$mask,
+ (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))),
+ (v8f32 VR256X:$src0)),
+ (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+ def : Pat<(vselect VK8WM:$mask,
+ (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))),
+ v8f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>;
+}
let Predicates = [HasVLX] in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
- (VCVTPD2PSZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
- (VCVTPD2PSZ128rm addr:$src)>;
- def : Pat<(v2f64 (extloadv2f32 addr:$src)),
- (VCVTPS2PDZ128rm addr:$src)>;
- def : Pat<(v4f64 (extloadv4f32 addr:$src)),
- (VCVTPS2PDZ256rm addr:$src)>;
+ def : Pat<(v4f32 (fpround (v4f64 VR256X:$src))),
+ (VCVTPD2PSZ256rr VR256X:$src)>;
+ def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))),
+ VR128X:$src0),
+ (VCVTPD2PSZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>;
+ def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))),
+ v4f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZ256rrkz VK4WM:$mask, VR256X:$src)>;
+
+ def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
+ (VCVTPD2PSZ256rm addr:$src)>;
+ def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))),
+ VR128X:$src0),
+ (VCVTPD2PSZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))),
+ v4f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ (VCVTPD2PSZ256rmb addr:$src)>;
+ def : Pat<(vselect VK4WM:$mask,
+ (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ VR128X:$src0),
+ (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(vselect VK4WM:$mask,
+ (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ v4f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>;
// Special patterns to allow use of X86vmfpround for masking. Instruction
// patterns have been disabled with null_frag.
@@ -8142,7 +7728,11 @@ multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
- OpNode128, sched.XMM, "{1to2}", "", i64mem>, EVEX_V128;
+ OpNode128, sched.XMM, "{1to2}", "", i64mem, VK2WM,
+ (v2f64 (OpNode128 (bc_v4i32
+ (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))>,
+ EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode,
sched.YMM>, EVEX_V256;
}
@@ -8167,12 +7757,12 @@ multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Convert Float to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode OpNodeSAE, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
- OpNodeRnd, sched.ZMM>, EVEX_V512;
+ OpNodeSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
@@ -8201,12 +7791,12 @@ multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Convert Double to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode OpNodeSAE, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
- OpNodeRnd, sched.ZMM>, EVEX_V512;
+ OpNodeSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -8218,16 +7808,49 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
VK2WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
sched.YMM, "{1to4}", "{y}">, EVEX_V256;
-
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">;
}
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
}
// Convert Double to Signed/Unsigned Doubleword
@@ -8249,16 +7872,47 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
VK2WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
sched.YMM, "{1to4}", "{y}">, EVEX_V256;
-
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">;
}
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
}
// Convert Double to Signed/Unsigned Quadword
@@ -8325,7 +7979,11 @@ multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
+ sched.XMM, "{1to2}", "", f64mem, VK2WM,
+ (v2i64 (OpNode (bc_v4f32
+ (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))>,
+ EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
sched.YMM>, EVEX_V256;
}
@@ -8343,7 +8001,11 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
+ sched.XMM, "{1to2}", "", f64mem, VK2WM,
+ (v2i64 (OpNode (bc_v4f32
+ (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))>,
+ EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
sched.YMM>, EVEX_V256;
}
@@ -8351,8 +8013,7 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Convert Signed/Unsigned Quadword to Float
multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd,
- X86SchedWriteWidths sched> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode,
sched.ZMM>,
@@ -8364,22 +8025,57 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
  // memory forms of these instructions in Asm Parser. They have the same
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128,
- sched.XMM, "{1to2}", "{x}">, EVEX_V128,
- NotEVEX2VEXConvertible;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, null_frag,
+ sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>,
+ EVEX_V128, NotEVEX2VEXConvertible;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
sched.YMM, "{1to4}", "{y}">, EVEX_V256,
NotEVEX2VEXConvertible;
-
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">;
}
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, i64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|"
+ "$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, i64mem:$src), 0, "att">;
}
defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP,
@@ -8390,19 +8086,19 @@ defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
PS, EVEX_CD8<32, CD8VF>;
defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86cvttp2si,
- X86cvttp2siRnd, SchedWriteCvtPS2DQ>,
+ X86cvttp2siSAE, SchedWriteCvtPS2DQ>,
XS, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86cvttp2si,
- X86cvttp2siRnd, SchedWriteCvtPD2DQ>,
+ X86cvttp2siSAE, SchedWriteCvtPD2DQ>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86cvttp2ui,
- X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PS,
+ X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PS,
EVEX_CD8<32, CD8VF>;
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86cvttp2ui,
- X86cvttp2uiRnd, SchedWriteCvtPD2DQ>,
+ X86cvttp2uiSAE, SchedWriteCvtPD2DQ>,
PS, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp,
@@ -8446,19 +8142,19 @@ defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
EVEX_CD8<32, CD8VH>;
defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86cvttp2si,
- X86cvttp2siRnd, SchedWriteCvtPD2DQ>, VEX_W,
+ X86cvttp2siSAE, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86cvttp2si,
- X86cvttp2siRnd, SchedWriteCvtPS2DQ>, PD,
+ X86cvttp2siSAE, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86cvttp2ui,
- X86cvttp2uiRnd, SchedWriteCvtPD2DQ>, VEX_W,
+ X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86cvttp2ui,
- X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PD,
+ X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
@@ -8469,67 +8165,15 @@ defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
EVEX_CD8<64, CD8VF>;
-defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP,
+defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp,
X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS,
EVEX_CD8<64, CD8VF>;
-defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP,
+defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp,
X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD,
EVEX_CD8<64, CD8VF>;
-let Predicates = [HasAVX512] in {
- def : Pat<(v16i32 (fp_to_sint (v16f32 VR512:$src))),
- (VCVTTPS2DQZrr VR512:$src)>;
- def : Pat<(v16i32 (fp_to_sint (loadv16f32 addr:$src))),
- (VCVTTPS2DQZrm addr:$src)>;
-
- def : Pat<(v16i32 (fp_to_uint (v16f32 VR512:$src))),
- (VCVTTPS2UDQZrr VR512:$src)>;
- def : Pat<(v16i32 (fp_to_uint (loadv16f32 addr:$src))),
- (VCVTTPS2UDQZrm addr:$src)>;
-
- def : Pat<(v8i32 (fp_to_sint (v8f64 VR512:$src))),
- (VCVTTPD2DQZrr VR512:$src)>;
- def : Pat<(v8i32 (fp_to_sint (loadv8f64 addr:$src))),
- (VCVTTPD2DQZrm addr:$src)>;
-
- def : Pat<(v8i32 (fp_to_uint (v8f64 VR512:$src))),
- (VCVTTPD2UDQZrr VR512:$src)>;
- def : Pat<(v8i32 (fp_to_uint (loadv8f64 addr:$src))),
- (VCVTTPD2UDQZrm addr:$src)>;
-}
-
let Predicates = [HasVLX] in {
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128X:$src))),
- (VCVTTPS2DQZ128rr VR128X:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
- (VCVTTPS2DQZ128rm addr:$src)>;
-
- def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src))),
- (VCVTTPS2UDQZ128rr VR128X:$src)>;
- def : Pat<(v4i32 (fp_to_uint (loadv4f32 addr:$src))),
- (VCVTTPS2UDQZ128rm addr:$src)>;
-
- def : Pat<(v8i32 (fp_to_sint (v8f32 VR256X:$src))),
- (VCVTTPS2DQZ256rr VR256X:$src)>;
- def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
- (VCVTTPS2DQZ256rm addr:$src)>;
-
- def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src))),
- (VCVTTPS2UDQZ256rr VR256X:$src)>;
- def : Pat<(v8i32 (fp_to_uint (loadv8f32 addr:$src))),
- (VCVTTPS2UDQZ256rm addr:$src)>;
-
- def : Pat<(v4i32 (fp_to_sint (v4f64 VR256X:$src))),
- (VCVTTPD2DQZ256rr VR256X:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
- (VCVTTPD2DQZ256rm addr:$src)>;
-
- def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src))),
- (VCVTTPD2UDQZ256rr VR256X:$src)>;
- def : Pat<(v4i32 (fp_to_uint (loadv4f64 addr:$src))),
- (VCVTTPD2UDQZ256rm addr:$src)>;
-
// Special patterns to allow use of X86mcvtp2Int for masking. Instruction
// patterns have been disabled with null_frag.
def : Pat<(v4i32 (X86cvtp2Int (v2f64 VR128X:$src))),
@@ -8647,72 +8291,64 @@ let Predicates = [HasVLX] in {
(VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
}
-let Predicates = [HasDQI] in {
- def : Pat<(v8i64 (fp_to_sint (v8f32 VR256X:$src))),
- (VCVTTPS2QQZrr VR256X:$src)>;
- def : Pat<(v8i64 (fp_to_sint (loadv8f32 addr:$src))),
- (VCVTTPS2QQZrm addr:$src)>;
-
- def : Pat<(v8i64 (fp_to_uint (v8f32 VR256X:$src))),
- (VCVTTPS2UQQZrr VR256X:$src)>;
- def : Pat<(v8i64 (fp_to_uint (loadv8f32 addr:$src))),
- (VCVTTPS2UQQZrm addr:$src)>;
-
- def : Pat<(v8i64 (fp_to_sint (v8f64 VR512:$src))),
- (VCVTTPD2QQZrr VR512:$src)>;
- def : Pat<(v8i64 (fp_to_sint (loadv8f64 addr:$src))),
- (VCVTTPD2QQZrm addr:$src)>;
-
- def : Pat<(v8i64 (fp_to_uint (v8f64 VR512:$src))),
- (VCVTTPD2UQQZrr VR512:$src)>;
- def : Pat<(v8i64 (fp_to_uint (loadv8f64 addr:$src))),
- (VCVTTPD2UQQZrm addr:$src)>;
-}
-
let Predicates = [HasDQI, HasVLX] in {
- def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src))),
- (VCVTTPS2QQZ256rr VR128X:$src)>;
- def : Pat<(v4i64 (fp_to_sint (loadv4f32 addr:$src))),
- (VCVTTPS2QQZ256rm addr:$src)>;
-
- def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src))),
- (VCVTTPS2UQQZ256rr VR128X:$src)>;
- def : Pat<(v4i64 (fp_to_uint (loadv4f32 addr:$src))),
- (VCVTTPS2UQQZ256rm addr:$src)>;
-
- def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src))),
- (VCVTTPD2QQZ128rr VR128X:$src)>;
- def : Pat<(v2i64 (fp_to_sint (loadv2f64 addr:$src))),
- (VCVTTPD2QQZ128rm addr:$src)>;
-
- def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src))),
- (VCVTTPD2UQQZ128rr VR128X:$src)>;
- def : Pat<(v2i64 (fp_to_uint (loadv2f64 addr:$src))),
- (VCVTTPD2UQQZ128rm addr:$src)>;
-
- def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src))),
- (VCVTTPD2QQZ256rr VR256X:$src)>;
- def : Pat<(v4i64 (fp_to_sint (loadv4f64 addr:$src))),
- (VCVTTPD2QQZ256rm addr:$src)>;
-
- def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src))),
- (VCVTTPD2UQQZ256rr VR256X:$src)>;
- def : Pat<(v4i64 (fp_to_uint (loadv4f64 addr:$src))),
- (VCVTTPD2UQQZ256rm addr:$src)>;
+ def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTPS2QQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTPS2UQQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTTPS2QQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTTPS2UQQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
}
let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
+def : Pat<(v8i32 (X86cvttp2ui (v8f32 VR256X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
(v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
-def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
+def : Pat<(v4i32 (X86cvttp2ui (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
(v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
-def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))),
+def : Pat<(v4i32 (X86cvttp2ui (v4f64 VR256X:$src1))),
(EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_xmm)>;
@@ -8738,80 +8374,117 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
}
-let Predicates = [HasAVX512, HasVLX] in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
- (VCVTPD2DQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
- (VCVTPD2DQZ128rm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))),
- (VCVTPD2UDQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
- (VCVTTPD2DQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
- (VCVTTPD2DQZ128rm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))),
- (VCVTTPD2UDQZ128rr VR128X:$src)>;
-
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
- (VCVTDQ2PDZ128rm addr:$src)>;
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+let Predicates = [HasVLX] in {
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDZ128rm addr:$src)>;
-
- def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
- (VCVTUDQ2PDZ128rm addr:$src)>;
- def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ def : Pat<(v2f64 (vselect VK2WM:$mask,
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2f64 (vselect VK2WM:$mask,
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ v2f64x_info.ImmAllZerosV)),
+ (VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTUDQ2PDZ128rm addr:$src)>;
-}
-
-let Predicates = [HasAVX512] in {
- def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))),
- (VCVTPD2PSZrm addr:$src)>;
- def : Pat<(v8f64 (extloadv8f32 addr:$src)),
- (VCVTPS2PDZrm addr:$src)>;
+ def : Pat<(v2f64 (vselect VK2WM:$mask,
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2f64 (vselect VK2WM:$mask,
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ v2f64x_info.ImmAllZerosV)),
+ (VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
}
let Predicates = [HasDQI, HasVLX] in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
+ // Special patterns to allow use of X86VMSintToFP for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v4f32 (X86VSintToFP (v2i64 VR128X:$src))),
(VCVTQQ2PSZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
+ def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v4f32 (X86VSintToFP (loadv2i64 addr:$src))),
+ (VCVTQQ2PSZ128rm addr:$src)>;
+ def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))),
+ (VCVTQQ2PSZ128rmb addr:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (v4f32 VR128X:$src0), VK2WM:$mask),
+ (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ v4f32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+ // Special patterns to allow use of X86VMUintToFP for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v4f32 (X86VUintToFP (v2i64 VR128X:$src))),
(VCVTUQQ2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v4f32 (X86VUintToFP (loadv2i64 addr:$src))),
+ (VCVTUQQ2PSZ128rm addr:$src)>;
+ def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))),
+ (VCVTUQQ2PSZ128rmb addr:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (v4f32 VR128X:$src0), VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ v4f32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
}
let Predicates = [HasDQI, NoVLX] in {
-def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src1))),
+def : Pat<(v2i64 (X86cvttp2si (v2f64 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
-def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src1))),
+def : Pat<(v4i64 (X86cvttp2si (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr
(v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_ymm)>;
-def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))),
+def : Pat<(v4i64 (X86cvttp2si (v4f64 VR256X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
-def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src1))),
+def : Pat<(v2i64 (X86cvttp2ui (v2f64 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
-def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))),
+def : Pat<(v4i64 (X86cvttp2ui (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr
(v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_ymm)>;
-def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))),
+def : Pat<(v4i64 (X86cvttp2ui (v4f64 VR256X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
@@ -8870,8 +8543,7 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
(ins _src.RC:$src), "vcvtph2ps",
"{sae}, $src", "$src, {sae}",
- (X86cvtph2psRnd (_src.VT _src.RC:$src),
- (i32 FROUND_NO_EXC))>,
+ (X86cvtph2psSAE (_src.VT _src.RC:$src))>,
T8PD, EVEX_B, Sched<[sched]>;
}
@@ -8890,9 +8562,7 @@ let Predicates = [HasVLX] in {
EVEX_CD8<32, CD8VH>;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
- (VCVTPH2PSZ128rm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(VCVTPH2PSZ128rm addr:$src)>;
def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
(v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
@@ -9055,12 +8725,12 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
- EVEX_4V, Sched<[sched]>;
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2)>, EVEX_4V,
+ _.ScalarIntMemCPat:$src2)>, EVEX_4V, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -9129,47 +8799,45 @@ defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>;
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode OpNode, X86FoldableSchedWrite sched> {
+ SDNode OpNode, SDNode OpNodeSAE,
+ X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_CURRENT))>,
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
Sched<[sched]>;
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC))>, EVEX_B,
- Sched<[sched]>;
+ (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_B, Sched<[sched]>;
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT))>,
+ (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched> {
- defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, sched>,
- EVEX_CD8<32, CD8VT1>;
- defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, sched>,
- EVEX_CD8<64, CD8VT1>, VEX_W;
+ SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
+ defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE,
+ sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, OpNodeSAE,
+ sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
}
let Predicates = [HasERI] in {
- defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SchedWriteFRcp.Scl>,
- T8PD, EVEX_4V;
- defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s,
+ defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs,
+ SchedWriteFRcp.Scl>, T8PD, EVEX_4V;
+ defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs,
SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V;
}
-defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds,
+defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
SchedWriteFRnd.Scl>, T8PD, EVEX_4V;
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
@@ -9178,42 +8846,40 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>,
+ (OpNode (_.VT _.RC:$src))>,
Sched<[sched]>;
defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
(OpNode (_.VT
- (bitconvert (_.LdFrag addr:$src))),
- (i32 FROUND_CURRENT))>,
+ (bitconvert (_.LdFrag addr:$src))))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.VT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- (i32 FROUND_CURRENT))>, EVEX_B,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
-multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+multiclass avx512_fp28_p_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain in
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>,
+ (OpNode (_.VT _.RC:$src))>,
EVEX_B, Sched<[sched]>;
}
multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86SchedWriteWidths sched> {
+ SDNode OpNodeSAE, X86SchedWriteWidths sched> {
defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
- avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_sae<opc, OpcodeStr#"ps", v16f32_info, OpNodeSAE, sched.ZMM>,
T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>,
- avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_sae<opc, OpcodeStr#"pd", v8f64_info, OpNodeSAE, sched.ZMM>,
T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
}
@@ -9221,24 +8887,32 @@ multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86SchedWriteWidths sched> {
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, sched.XMM>,
- EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, sched.YMM>,
- EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, sched.XMM>,
- EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, sched.YMM>,
- EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode,
+ sched.XMM>,
+ EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode,
+ sched.YMM>,
+ EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode,
+ sched.XMM>,
+ EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode,
+ sched.YMM>,
+ EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
}
}
let Predicates = [HasERI] in {
- defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SchedWriteFRsqrt>, EVEX;
- defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SchedWriteFRcp>, EVEX;
- defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SchedWriteFAdd>, EVEX;
-}
-defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SchedWriteFRnd>,
- avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd,
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE,
+ SchedWriteFRsqrt>, EVEX;
+ defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, X86rcp28SAE,
+ SchedWriteFRcp>, EVEX;
+ defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, X86exp2SAE,
+ SchedWriteFAdd>, EVEX;
+}
+defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
+ SchedWriteFRnd>,
+ avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexp,
SchedWriteFRnd>, EVEX;
multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
@@ -9246,7 +8920,7 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
- (_.VT (X86fsqrtRnd _.RC:$src, (i32 imm:$rc)))>,
+ (_.VT (X86fsqrtRnd _.RC:$src, (i32 timm:$rc)))>,
EVEX, EVEX_B, EVEX_RC, Sched<[sched]>;
}
@@ -9312,23 +8986,21 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (X86fsqrtRnds (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (i32 FROUND_CURRENT))>,
+ (X86fsqrts (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2))>,
Sched<[sched]>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (X86fsqrtRnds (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT))>,
+ (X86fsqrts (_.VT _.RC:$src1),
+ _.ScalarIntMemCPat:$src2)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(X86fsqrtRnds (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$rc))>,
+ (i32 timm:$rc))>,
EVEX_B, EVEX_RC, Sched<[sched]>;
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in {
@@ -9383,8 +9055,8 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
- (_.VT (X86RndScalesRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B,
+ (_.VT (X86RndScalesSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 imm:$src3)))>, EVEX_B,
Sched<[sched]>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -9410,50 +9082,26 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
}
let Predicates = [HasAVX512] in {
- def : Pat<(ffloor _.FRC:$src),
- (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src, (i32 0x9)))>;
- def : Pat<(fceil _.FRC:$src),
- (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src, (i32 0xa)))>;
- def : Pat<(ftrunc _.FRC:$src),
+ def : Pat<(X86VRndScale _.FRC:$src1, imm:$src2),
(_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src, (i32 0xb)))>;
- def : Pat<(frint _.FRC:$src),
- (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src, (i32 0x4)))>;
- def : Pat<(fnearbyint _.FRC:$src),
- (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src, (i32 0xc)))>;
+ _.FRC:$src1, imm:$src2))>;
}
let Predicates = [HasAVX512, OptForSize] in {
- def : Pat<(ffloor (_.ScalarLdFrag addr:$src)),
- (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src, (i32 0x9)))>;
- def : Pat<(fceil (_.ScalarLdFrag addr:$src)),
- (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src, (i32 0xa)))>;
- def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)),
+ def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), imm:$src2),
(_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src, (i32 0xb)))>;
- def : Pat<(frint (_.ScalarLdFrag addr:$src)),
- (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src, (i32 0x4)))>;
- def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)),
- (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src, (i32 0xc)))>;
+ addr:$src1, imm:$src2))>;
}
}
defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless",
SchedWriteFRnd.Scl, f32x_info>,
- AVX512AIi8Base, EVEX_4V,
+ AVX512AIi8Base, EVEX_4V, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd",
SchedWriteFRnd.Scl, f64x_info>,
- VEX_W, AVX512AIi8Base, EVEX_4V,
+ VEX_W, AVX512AIi8Base, EVEX_4V, VEX_LIG,
EVEX_CD8<64, CD8VT1>;
multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
@@ -9481,32 +9129,6 @@ defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd,
(v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v2f64x_info,
fp64imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
-multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move,
- X86VectorVTInfo _, PatLeaf ZeroFP,
- bits<8> ImmV, Predicate BasePredicate> {
- let Predicates = [BasePredicate] in {
- def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask,
- (OpNode (extractelt _.VT:$src2, (iPTR 0))),
- (extractelt _.VT:$dst, (iPTR 0))))),
- (!cast<Instruction>("V"#OpcPrefix#Zr_Intk)
- _.VT:$dst, VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
-
- def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask,
- (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))),
- (!cast<Instruction>("V"#OpcPrefix#Zr_Intkz)
- VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
- }
-}
-
-defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESS", X86Movss,
- v4f32x_info, fp32imm0, 0x01, HasAVX512>;
-defm : avx512_masked_scalar_imm<fceil, "RNDSCALESS", X86Movss,
- v4f32x_info, fp32imm0, 0x02, HasAVX512>;
-defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESD", X86Movsd,
- v2f64x_info, fp64imm0, 0x01, HasAVX512>;
-defm : avx512_masked_scalar_imm<fceil, "RNDSCALESD", X86Movsd,
- v2f64x_info, fp64imm0, 0x02, HasAVX512>;
-
//-------------------------------------------------
// Integer truncate and extend operations
@@ -9966,26 +9588,14 @@ multiclass AVX512_pmovx_patterns_base<string OpcPrefix, SDNode ExtOp> {
let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
- def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
- def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
(!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
(!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
}
// 512-bit patterns
@@ -10007,41 +9617,6 @@ multiclass AVX512_pmovx_patterns_base<string OpcPrefix, SDNode ExtOp> {
}
}
-multiclass AVX512_pmovx_patterns_aext<string OpcPrefix, SDNode ExtOp> :
- AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> {
- let Predicates = [HasVLX, HasBWI] in {
- def : Pat<(v16i16 (ExtOp (v16i8 VR128X:$src))),
- (!cast<I>(OpcPrefix#BWZ256rr) VR128X:$src)>;
- }
-
- let Predicates = [HasVLX] in {
- def : Pat<(v8i32 (ExtOp (v8i16 VR128X:$src))),
- (!cast<I>(OpcPrefix#WDZ256rr) VR128X:$src)>;
-
- def : Pat<(v4i64 (ExtOp (v4i32 VR128X:$src))),
- (!cast<I>(OpcPrefix#DQZ256rr) VR128X:$src)>;
- }
-
- // 512-bit patterns
- let Predicates = [HasBWI] in {
- def : Pat<(v32i16 (ExtOp (v32i8 VR256X:$src))),
- (!cast<I>(OpcPrefix#BWZrr) VR256X:$src)>;
- }
- let Predicates = [HasAVX512] in {
- def : Pat<(v16i32 (ExtOp (v16i8 VR128X:$src))),
- (!cast<I>(OpcPrefix#BDZrr) VR128X:$src)>;
- def : Pat<(v16i32 (ExtOp (v16i16 VR256X:$src))),
- (!cast<I>(OpcPrefix#WDZrr) VR256X:$src)>;
-
- def : Pat<(v8i64 (ExtOp (v8i16 VR128X:$src))),
- (!cast<I>(OpcPrefix#WQZrr) VR128X:$src)>;
-
- def : Pat<(v8i64 (ExtOp (v8i32 VR256X:$src))),
- (!cast<I>(OpcPrefix#DQZrr) VR256X:$src)>;
- }
-}
-
-
multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
SDNode InVecOp> :
AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> {
@@ -10051,103 +9626,62 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (loadv8i16 addr:$src))),
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (loadv8i16 addr:$src))),
+ def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))),
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
}
// 512-bit patterns
let Predicates = [HasAVX512] in {
def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
- def : Pat<(v8i64 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
}
}
defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>;
defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>;
-defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>;
// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge
// ext+trunc aggressively making it impossible to legalize the DAG to this
@@ -10155,22 +9689,8 @@ defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>;
let Predicates = [HasAVX512, NoBWI] in {
def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
(VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
-def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))),
+def: Pat<(v16i8 (trunc (loadv16i16 addr:$src))),
(VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>;
-def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst),
- (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
-}
-
-// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge
-// ext+trunc aggresively making it impossible to legalize the DAG to this
-// pattern directly.
-let Predicates = [HasAVX512, NoBWI] in {
-def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
- (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
-def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))),
- (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>;
-def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst),
- (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
}
//===----------------------------------------------------------------------===//
@@ -10457,7 +9977,7 @@ multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr, X86FoldableSchedWrite sched> {
defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86compress _.RC:$src1))>, AVX5128IBase,
+ (null_frag)>, AVX5128IBase,
Sched<[sched]>;
let mayStore = 1, hasSideEffects = 0 in
@@ -10479,6 +9999,13 @@ multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
def : Pat<(X86mCompressingStore (_.VT _.RC:$src), addr:$dst, _.KRCWM:$mask),
(!cast<Instruction>(Name#_.ZSuffix##mrk)
addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
+
+ def : Pat<(X86compress (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix##rrk)
+ _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>;
+ def : Pat<(X86compress (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix##rrkz)
+ _.KRCWM:$mask, _.RC:$src)>;
}
multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
@@ -10512,13 +10039,12 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr, X86FoldableSchedWrite sched> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86expand _.RC:$src1))>, AVX5128IBase,
+ (null_frag)>, AVX5128IBase,
Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86expand (_.VT (bitconvert
- (_.LdFrag addr:$src1)))))>,
+ (null_frag)>,
AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -10537,6 +10063,13 @@ multiclass expand_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
(_.VT _.RC:$src0))),
(!cast<Instruction>(Name#_.ZSuffix##rmk)
_.RC:$src0, _.KRCWM:$mask, addr:$src)>;
+
+ def : Pat<(X86expand (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix##rrk)
+ _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>;
+ def : Pat<(X86expand (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix##rrkz)
+ _.KRCWM:$mask, _.RC:$src)>;
}
multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
@@ -10603,18 +10136,17 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
"$src1, {sae}, $src2",
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2),
- (i32 FROUND_NO_EXC))>,
+ (i32 imm:$src2))>,
EVEX_B, Sched<[sched]>;
}
multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
+ SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
let Predicates = [prd] in {
defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM,
_.info512>,
- avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd,
+ avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE,
sched.ZMM, _.info512>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
@@ -10733,8 +10265,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
"$src1, $src2, {sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3),
- (i32 FROUND_NO_EXC))>,
+ (i32 imm:$src3))>,
EVEX_B, Sched<[sched]>;
}
@@ -10748,17 +10279,16 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode
"$src1, $src2, {sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3),
- (i32 FROUND_NO_EXC))>,
+ (i32 imm:$src3))>,
EVEX_B, Sched<[sched]>;
}
multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
+ SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
let Predicates = [prd] in {
defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
- avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, sched.ZMM, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE, sched.ZMM, _.info512>,
EVEX_V512;
}
@@ -10802,267 +10332,64 @@ multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
X86VectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd> {
+ SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd> {
let Predicates = [prd] in {
defm Z : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, sched.XMM, _>,
- avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeRnd, sched.XMM, _>;
+ avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeSAE, sched.XMM, _>;
}
}
multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
bits<8> opcPs, bits<8> opcPd, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
+ SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
- opcPs, OpNode, OpNodeRnd, sched, prd>,
+ opcPs, OpNode, OpNodeSAE, sched, prd>,
EVEX_CD8<32, CD8VF>;
defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
- opcPd, OpNode, OpNodeRnd, sched, prd>,
+ opcPd, OpNode, OpNodeSAE, sched, prd>,
EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
- X86VReduce, X86VReduceRnd, SchedWriteFRnd, HasDQI>,
+ X86VReduce, X86VReduceSAE, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, EVEX;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
- X86VRndScale, X86VRndScaleRnd, SchedWriteFRnd, HasAVX512>,
+ X86VRndScale, X86VRndScaleSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
- X86VGetMant, X86VGetMantRnd, SchedWriteFRnd, HasAVX512>,
+ X86VGetMant, X86VGetMantSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
- 0x50, X86VRange, X86VRangeRnd,
+ 0x50, X86VRange, X86VRangeSAE,
SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
- 0x50, X86VRange, X86VRangeRnd,
+ 0x50, X86VRange, X86VRangeSAE,
SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd",
- f64x_info, 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>,
+ f64x_info, 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
- 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>,
+ 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
- 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>,
+ 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
- 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>,
+ 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
- 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>,
+ 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
- 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>,
+ 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
-
-multiclass AVX512_rndscale_lowering<X86VectorVTInfo _, string Suffix> {
- // Register
- def : Pat<(_.VT (ffloor _.RC:$src)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
- _.RC:$src, (i32 0x9))>;
- def : Pat<(_.VT (fnearbyint _.RC:$src)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
- _.RC:$src, (i32 0xC))>;
- def : Pat<(_.VT (fceil _.RC:$src)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
- _.RC:$src, (i32 0xA))>;
- def : Pat<(_.VT (frint _.RC:$src)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
- _.RC:$src, (i32 0x4))>;
- def : Pat<(_.VT (ftrunc _.RC:$src)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
- _.RC:$src, (i32 0xB))>;
-
- // Merge-masking
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
- _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
- _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
- _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
- _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
- _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xB))>;
-
- // Zero-masking
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
- _.KRCWM:$mask, _.RC:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
- _.KRCWM:$mask, _.RC:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
- _.KRCWM:$mask, _.RC:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
- _.KRCWM:$mask, _.RC:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
- _.KRCWM:$mask, _.RC:$src, (i32 0xB))>;
-
- // Load
- def : Pat<(_.VT (ffloor (_.LdFrag addr:$src))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
- addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (fnearbyint (_.LdFrag addr:$src))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
- addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (fceil (_.LdFrag addr:$src))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
- addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (frint (_.LdFrag addr:$src))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
- addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (ftrunc (_.LdFrag addr:$src))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
- addr:$src, (i32 0xB))>;
-
- // Merge-masking + load
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>;
-
- // Zero-masking + load
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
- _.KRCWM:$mask, addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
- _.KRCWM:$mask, addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
- _.KRCWM:$mask, addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
- _.KRCWM:$mask, addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
- _.KRCWM:$mask, addr:$src, (i32 0xB))>;
-
- // Broadcast load
- def : Pat<(_.VT (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
- addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
- addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
- addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (frint (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
- addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
- addr:$src, (i32 0xB))>;
-
- // Merge-masking + broadcast load
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>;
-
- // Zero-masking + broadcast load
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
- _.KRCWM:$mask, addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
- _.KRCWM:$mask, addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
- _.KRCWM:$mask, addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
- _.KRCWM:$mask, addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
- _.KRCWM:$mask, addr:$src, (i32 0xB))>;
-}
-
-let Predicates = [HasAVX512] in {
- defm : AVX512_rndscale_lowering<v16f32_info, "PS">;
- defm : AVX512_rndscale_lowering<v8f64_info, "PD">;
-}
-
-let Predicates = [HasVLX] in {
- defm : AVX512_rndscale_lowering<v8f32x_info, "PS">;
- defm : AVX512_rndscale_lowering<v4f64x_info, "PD">;
- defm : AVX512_rndscale_lowering<v4f32x_info, "PS">;
- defm : AVX512_rndscale_lowering<v2f64x_info, "PD">;
-}
-
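For reference, the hex immediates in the patterns removed above come straight from the VRNDSCALE imm8 encoding: bits [1:0] select the rounding mode (0 nearest, 1 down, 2 up, 3 truncate), bit 2 selects the current MXCSR rounding mode instead of the immediate one, and bit 3 suppresses the precision exception. A small standalone C++ sketch (illustration only, not part of this patch) decoding the values used in those patterns:

#include <cstdint>
#include <cstdio>

// Decode the low nibble of a VRNDSCALE immediate. Bits [1:0]: rounding
// control, bit 2: use MXCSR.RC instead of the immediate mode, bit 3:
// suppress the precision exception.
static void decode(uint8_t imm, const char *op) {
  static const char *rc[] = {"nearest", "down", "up", "truncate"};
  printf("%-11s imm=0x%X rc=%-8s useMXCSR=%u suppressPE=%u\n", op,
         unsigned(imm), rc[imm & 3], (imm >> 2) & 1u, (imm >> 3) & 1u);
}

int main() {
  decode(0x9, "ffloor");     // round down, precision exception suppressed
  decode(0xA, "fceil");      // round up, precision exception suppressed
  decode(0xB, "ftrunc");     // truncate, precision exception suppressed
  decode(0xC, "fnearbyint"); // current (MXCSR) rounding, no precision exception
  decode(0x4, "frint");      // current (MXCSR) rounding, exceptions allowed
  return 0;
}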
multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched,
X86VectorVTInfo _,
@@ -11544,9 +10871,9 @@ def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
-def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
+def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
@@ -11554,21 +10881,21 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
(v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
- (bitconvert (v4i32 immAllZerosV))),
+ immAllZerosV),
(VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
- (bitconvert (v4i32 immAllZerosV))),
+ immAllZerosV),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
- (bitconvert (v4i32 immAllZerosV))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
+ immAllZerosV),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
@@ -12067,39 +11394,39 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
// TODO: We should maybe have a more generalized algorithm for folding to
// vpternlog.
let Predicates = [HasAVX512] in {
- def : Pat<(xor VR512:$src, (bc_v64i8 (v16i32 immAllOnesV))),
+ def : Pat<(xor VR512:$src, (v64i8 immAllOnesV)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
- def : Pat<(xor VR512:$src, (bc_v32i16 (v16i32 immAllOnesV))),
+ def : Pat<(xor VR512:$src, (v32i16 immAllOnesV)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
- def : Pat<(xor VR512:$src, (bc_v16i32 (v16i32 immAllOnesV))),
+ def : Pat<(xor VR512:$src, (v16i32 immAllOnesV)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
- def : Pat<(xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV))),
+ def : Pat<(xor VR512:$src, (v8i64 immAllOnesV)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
}
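The (i8 15) immediate in these VPTERNLOGQ patterns is the 8-entry truth table whose output is the complement of the (identical) inputs, which is why an xor with all-ones can be folded into it. A minimal C++ model of the truth-table evaluation (illustrative sketch, not part of this patch):

#include <cassert>
#include <cstdint>

// Evaluate the VPTERNLOG truth table bit by bit: for each bit position an
// index into the 8-entry table 'imm' is formed from the corresponding bits
// of a, b and c, and the result bit is imm[index].
static uint64_t ternlog(uint64_t a, uint64_t b, uint64_t c, uint8_t imm) {
  uint64_t r = 0;
  for (int i = 0; i < 64; ++i) {
    unsigned idx = unsigned((((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) |
                            ((c >> i) & 1));
    r |= (uint64_t((imm >> idx) & 1)) << i;
  }
  return r;
}

int main() {
  uint64_t x = 0x0123456789abcdefULL;
  // With all three sources tied to the same register only table entries 0
  // and 7 are reachable; imm = 0x0F maps 0 -> 1 and 7 -> 0, i.e. bitwise NOT.
  assert(ternlog(x, x, x, 0x0F) == ~x);
  return 0;
}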
let Predicates = [HasAVX512, NoVLX] in {
- def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
@@ -12107,28 +11434,28 @@ let Predicates = [HasAVX512, NoVLX] in {
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(i8 15)), sub_ymm)>;
- def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(i8 15)), sub_ymm)>;
- def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(i8 15)), sub_ymm)>;
- def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
@@ -12138,22 +11465,22 @@ let Predicates = [HasAVX512, NoVLX] in {
}
let Predicates = [HasVLX] in {
- def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
}
@@ -12161,58 +11488,55 @@ let Predicates = [HasVLX] in {
// AVX-512 - FixupImm
//===----------------------------------------------------------------------===//
-multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
X86VectorVTInfo TblVT>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (TblVT.VT _.RC:$src3),
- (i32 imm:$src4),
- (i32 FROUND_CURRENT))>, Sched<[sched]>;
+ (X86VFixupimm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT _.RC:$src3),
+ (i32 imm:$src4))>, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
- (i32 imm:$src4),
- (i32 FROUND_CURRENT))>,
+ (X86VFixupimm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
+ (i32 imm:$src4))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr##", $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))),
- (i32 imm:$src4),
- (i32 FROUND_CURRENT))>,
+ (X86VFixupimm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))),
+ (i32 imm:$src4))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // Constraints = "$src1 = $dst"
}
multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86FoldableSchedWrite sched,
- X86VectorVTInfo _, X86VectorVTInfo TblVT>{
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo TblVT>
+ : avx512_fixupimm_packed<opc, OpcodeStr, sched, _, TblVT> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
"$src2, $src3, {sae}, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (TblVT.VT _.RC:$src3),
- (i32 imm:$src4),
- (i32 FROUND_NO_EXC))>,
+ (X86VFixupimmSAE (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT _.RC:$src3),
+ (i32 imm:$src4))>,
EVEX_B, Sched<[sched]>;
}
}
-multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
X86VectorVTInfo _src3VT> {
let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
@@ -12220,30 +11544,27 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (_src3VT.VT _src3VT.RC:$src3),
- (i32 imm:$src4),
- (i32 FROUND_CURRENT))>, Sched<[sched]>;
+ (X86VFixupimms (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT _src3VT.RC:$src3),
+ (i32 imm:$src4))>, Sched<[sched]>;
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
"$src2, $src3, {sae}, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (_src3VT.VT _src3VT.RC:$src3),
- (i32 imm:$src4),
- (i32 FROUND_NO_EXC))>,
+ (X86VFixupimmSAEs (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT _src3VT.RC:$src3),
+ (i32 imm:$src4))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (_src3VT.VT (scalar_to_vector
- (_src3VT.ScalarLdFrag addr:$src3))),
- (i32 imm:$src4),
- (i32 FROUND_CURRENT))>,
+ (X86VFixupimms (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT (scalar_to_vector
+ (_src3VT.ScalarLdFrag addr:$src3))),
+ (i32 imm:$src4))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -12252,25 +11573,23 @@ multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _Vec,
AVX512VLVectorVTInfo _Tbl> {
let Predicates = [HasAVX512] in
- defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.ZMM,
- _Vec.info512, _Tbl.info512>,
- avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, sched.ZMM,
+ defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM,
_Vec.info512, _Tbl.info512>, AVX512AIi8Base,
EVEX_4V, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.XMM,
+ defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.XMM,
_Vec.info128, _Tbl.info128>, AVX512AIi8Base,
EVEX_4V, EVEX_V128;
- defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.YMM,
+ defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.YMM,
_Vec.info256, _Tbl.info256>, AVX512AIi8Base,
EVEX_4V, EVEX_V256;
}
}
-defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm",
SchedWriteFAdd.Scl, f32x_info, v4i32x_info>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
-defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm",
SchedWriteFAdd.Scl, f64x_info, v2i64x_info>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f32_info,
@@ -12331,6 +11650,12 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
_.FRC:$src)))),
(!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
(_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
+ def : Pat<(MoveNode
+ (_.VT VR128X:$dst),
+ (_.VT (scalar_to_vector
+ (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src))))),
+ (!cast<Instruction>("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -12344,6 +11669,16 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
(_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
VK1WM:$mask, _.VT:$src1,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)),
+ _.FRC:$src0))),
+ (!cast<Instruction>("V"#OpcPrefix#Zrm_Intk)
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
+ VK1WM:$mask, _.VT:$src1, addr:$src2)>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -12355,6 +11690,13 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
(!cast<I>("V"#OpcPrefix#Zrr_Intkz)
VK1WM:$mask, _.VT:$src1,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>;
}
}
@@ -12380,26 +11722,6 @@ multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix,
defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
-multiclass AVX512_scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix,
- SDNode Move, X86VectorVTInfo _,
- bits<8> ImmV> {
- let Predicates = [HasAVX512] in {
- def : Pat<(_.VT (Move _.VT:$dst,
- (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))),
- (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src,
- (i32 ImmV))>;
- }
-}
-
-defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESS", X86Movss,
- v4f32x_info, 0x01>;
-defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESS", X86Movss,
- v4f32x_info, 0x02>;
-defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESD", X86Movsd,
- v2f64x_info, 0x01>;
-defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESD", X86Movsd,
- v2f64x_info, 0x02>;
-
//===----------------------------------------------------------------------===//
// AES instructions
//===----------------------------------------------------------------------===//
@@ -12612,12 +11934,19 @@ defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SchedWriteVecALU,
defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>;
defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>;
+def X86Vpshufbitqmb_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86Vpshufbitqmb node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst),
(ins VTI.RC:$src1, VTI.RC:$src2),
"vpshufbitqmb",
"$src2, $src1", "$src1, $src2",
(X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
+ (VTI.VT VTI.RC:$src2)),
+ (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
(VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD,
Sched<[sched]>;
defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst),
@@ -12625,6 +11954,8 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
"vpshufbitqmb",
"$src2, $src1", "$src1, $src2",
(X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
+ (VTI.VT (VTI.LdFrag addr:$src2))),
+ (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
(VTI.VT (VTI.LdFrag addr:$src2)))>,
EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -12720,13 +12051,13 @@ defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info,
defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info,
(outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
"v4fmaddss", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
Sched<[SchedWriteFMA.Scl.Folded]>;
defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info,
(outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
"v4fnmaddss", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
Sched<[SchedWriteFMA.Scl.Folded]>;
}
@@ -12749,3 +12080,196 @@ defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info,
Sched<[SchedWriteFMA.ZMM.Folded]>;
}
+let hasSideEffects = 0 in {
+ let mayStore = 1 in
+ def MASKPAIR16STORE : PseudoI<(outs), (ins anymem:$dst, VK16PAIR:$src), []>;
+ let mayLoad = 1 in
+ def MASKPAIR16LOAD : PseudoI<(outs VK16PAIR:$dst), (ins anymem:$src), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// VP2INTERSECT
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> {
+ def rr : I<0x68, MRMSrcReg,
+ (outs _.KRPC:$dst),
+ (ins _.RC:$src1, _.RC:$src2),
+ !strconcat("vp2intersect", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRPC:$dst, (X86vp2intersect
+ _.RC:$src1, (_.VT _.RC:$src2)))]>,
+ EVEX_4V, T8XD;
+
+ def rm : I<0x68, MRMSrcMem,
+ (outs _.KRPC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2),
+ !strconcat("vp2intersect", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRPC:$dst, (X86vp2intersect
+ _.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+ EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>;
+
+ def rmb : I<0x68, MRMSrcMem,
+ (outs _.KRPC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr,
+ ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"),
+ [(set _.KRPC:$dst, (X86vp2intersect
+ _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>,
+ EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_vp2intersect<AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512, HasVP2INTERSECT] in
+ defm Z : avx512_vp2intersect_modes<_.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVP2INTERSECT, HasVLX] in {
+ defm Z256 : avx512_vp2intersect_modes<_.info256>, EVEX_V256;
+ defm Z128 : avx512_vp2intersect_modes<_.info128>, EVEX_V128;
+ }
+}
+
+defm VP2INTERSECTD : avx512_vp2intersect<avx512vl_i32_info>;
+defm VP2INTERSECTQ : avx512_vp2intersect<avx512vl_i64_info>, VEX_W;
+
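For context, vp2intersect{d,q} writes a pair of mask registers (the KRPC class above): one mask marks which elements of the first source have a match anywhere in the second, the other marks the matching elements of the second source. A short C++ model of that behaviour for one 4-element case (illustrative only, not part of this patch):

#include <cstdint>
#include <cstdio>

// Sketch of vp2intersectd on 4 dword elements: k1 marks elements of a that
// match some element of b, k2 marks elements of b that match some element of a.
static void vp2intersect4(const uint32_t a[4], const uint32_t b[4],
                          uint8_t &k1, uint8_t &k2) {
  k1 = k2 = 0;
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      if (a[i] == b[j]) {
        k1 |= uint8_t(1u << i);
        k2 |= uint8_t(1u << j);
      }
}

int main() {
  uint32_t a[4] = {1, 2, 3, 4}, b[4] = {4, 5, 1, 6};
  uint8_t k1, k2;
  vp2intersect4(a, b, k1, k2);
  printf("k1=0x%X k2=0x%X\n", k1, k2); // k1=0x9 (elts 0,3), k2=0x5 (elts 0,2)
  return 0;
}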
+multiclass avx512_binop_all2<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _SrcVTInfo,
+ AVX512VLVectorVTInfo _DstVTInfo,
+ SDNode OpNode, Predicate prd,
+ bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode,
+ _SrcVTInfo.info512, _DstVTInfo.info512,
+ _SrcVTInfo.info512, IsCommutable>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ let Predicates = [HasVLX, prd] in {
+ defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode,
+ _SrcVTInfo.info256, _DstVTInfo.info256,
+ _SrcVTInfo.info256, IsCommutable>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, sched.XMM, OpNode,
+ _SrcVTInfo.info128, _DstVTInfo.info128,
+ _SrcVTInfo.info128, IsCommutable>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ }
+}
+
+defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16",
+                                        SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF
+ avx512vl_f32_info, avx512vl_i16_info,
+ X86cvtne2ps2bf16, HasBF16, 0>, T8XD;
+
+// Truncate Float to BFloat16
+multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasBF16] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i16x_info, v16f32_info,
+ X86cvtneps2bf16, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasBF16, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v4f32x_info,
+ null_frag, sched.XMM, "{1to4}", "{x}", f128mem,
+ VK4WM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v8f32x_info,
+ X86cvtneps2bf16,
+ sched.YMM, "{1to8}", "{y}">, EVEX_V256;
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst,
+ f128mem:$src), 0, "intel">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst,
+ f256mem:$src), 0, "intel">;
+ }
+}
+
+defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16",
+ SchedWriteCvtPD2PS>, T8XS,
+ EVEX_CD8<32, CD8VF>;
+
+let Predicates = [HasBF16, HasVLX] in {
+ // Special patterns to allow use of X86mcvtneps2bf16 for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 VR128X:$src))),
+ (VCVTNEPS2BF16Z128rr VR128X:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), (v8i16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rrk VR128X:$src0, VK4WM:$mask, VR128X:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), v8i16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rrkz VK4WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v8i16 (X86cvtneps2bf16 (loadv4f32 addr:$src))),
+ (VCVTNEPS2BF16Z128rm addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), (v8i16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), v8i16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32
+ (X86VBroadcast (loadf32 addr:$src))))),
+ (VCVTNEPS2BF16Z128rmb addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ (v8i16 VR128X:$src0), VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ v8i16x_info.ImmAllZerosV, VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>;
+}
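The "ne" in vcvtneps2bf16 stands for round-to-nearest-even truncation of a float32 to its top 16 bits. A rough C++ model of that rounding step (illustration only, not part of this patch; hardware NaN and denormal handling differ in detail):

#include <cassert>
#include <cstdint>
#include <cstring>

// Round-to-nearest-even conversion of an IEEE float to bfloat16 by keeping
// the high 16 bits after adding a rounding bias based on the kept LSB.
static uint16_t fp32_to_bf16_rne(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint32_t lsb = (bits >> 16) & 1; // parity of the part that will be kept
  bits += 0x7FFF + lsb;            // round to nearest, ties to even
  return uint16_t(bits >> 16);
}

int main() {
  assert(fp32_to_bf16_rne(1.0f) == 0x3F80);
  assert(fp32_to_bf16_rne(-2.0f) == 0xC000);
  return 0;
}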
+
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, X86VectorVTInfo src_v> {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+ EVEX_4V;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2,
+ (src_v.VT (bitconvert
+ (src_v.LdFrag addr:$src3)))))>, EVEX_4V;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr,
+ !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr),
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2,
+ (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>,
+ EVEX_B, EVEX_4V;
+
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _,
+ AVX512VLVectorVTInfo src_v, Predicate prd> {
+ let Predicates = [prd] in {
+ defm Z : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info512,
+ src_v.info512>, EVEX_V512;
+ }
+ let Predicates = [HasVLX, prd] in {
+ defm Z256 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info256,
+ src_v.info256>, EVEX_V256;
+ defm Z128 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info128,
+ src_v.info128>, EVEX_V128;
+ }
+}
+
+defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps,
+ avx512vl_f32_info, avx512vl_i32_info,
+ HasBF16>, T8XS, EVEX_CD8<32, CD8VF>;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index cb5a4e5b5d41..e52635f8d48b 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1,9 +1,8 @@
//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -195,19 +194,22 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
// Surprisingly enough, these are not two address instructions!
let Defs = [EFLAGS] in {
+// NOTE: These are order specific, we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
+
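+// The preference matters because the 0x6B forms carry a single sign-extended
+// immediate byte while the 0x69 forms need a full 16- or 32-bit immediate, so
+// listing the ri8 patterns first lets the matcher pick the shorter encoding
+// whenever the constant fits. A tiny standalone C++ sketch of the
+// fits-in-signed-8-bit test that the i*immSExt8 fragments express
+// (illustration only, not part of this patch):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   // The sign-extended 8-bit IMUL form (opcode 0x6B) is usable only if
+//   // sign-extending the low byte reproduces the full immediate.
+//   static bool fitsInSExt8(int64_t imm) { return imm >= -128 && imm <= 127; }
+//
+//   int main() {
+//     assert(fitsInSExt8(100));   // imul $100, %eax can use the short 0x6B form
+//     assert(!fitsInSExt8(1000)); // needs the full-width 0x69 form
+//     assert(fitsInSExt8(-1));
+//     return 0;
+//   }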
// Register-Integer Signed Integer Multiply
-def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
- (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
- "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, imm:$src2))]>,
- Sched<[WriteIMul16Imm]>, OpSize16;
def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
(X86smul_flag GR16:$src1, i16immSExt8:$src2))]>,
Sched<[WriteIMul16Imm]>, OpSize16;
+def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, imm:$src2))]>,
+ Sched<[WriteIMul16Imm]>, OpSize16;
def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
(outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -220,26 +222,20 @@ def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
[(set GR32:$dst, EFLAGS,
(X86smul_flag GR32:$src1, i32immSExt8:$src2))]>,
Sched<[WriteIMul32Imm]>, OpSize32;
-def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32
- (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
- "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>,
- Sched<[WriteIMul64Imm]>;
def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
(outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
(X86smul_flag GR64:$src1, i64immSExt8:$src2))]>,
Sched<[WriteIMul64Imm]>;
+def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>,
+ Sched<[WriteIMul64Imm]>;
// Memory-Integer Signed Integer Multiply
-def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
- (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
- "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR16:$dst, EFLAGS,
- (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>,
- Sched<[WriteIMul16Imm.Folded]>, OpSize16;
def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
(outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -247,12 +243,12 @@ def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
(X86smul_flag (loadi16 addr:$src1),
i16immSExt8:$src2))]>,
Sched<[WriteIMul16Imm.Folded]>, OpSize16;
-def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
- (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
- "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32:$dst, EFLAGS,
- (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>,
- Sched<[WriteIMul32Imm.Folded]>, OpSize32;
+def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
+ (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>,
+ Sched<[WriteIMul16Imm.Folded]>, OpSize16;
def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
(outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -260,13 +256,12 @@ def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
(X86smul_flag (loadi32 addr:$src1),
i32immSExt8:$src2))]>,
Sched<[WriteIMul32Imm.Folded]>, OpSize32;
-def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32
- (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
- "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR64:$dst, EFLAGS,
- (X86smul_flag (loadi64 addr:$src1),
- i64immSExt32:$src2))]>,
- Sched<[WriteIMul64Imm.Folded]>;
+def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
+ (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>,
+ Sched<[WriteIMul32Imm.Folded]>, OpSize32;
def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
(outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -274,6 +269,13 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
(X86smul_flag (loadi64 addr:$src1),
i64immSExt8:$src2))]>,
Sched<[WriteIMul64Imm.Folded]>;
+def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (loadi64 addr:$src1),
+ i64immSExt32:$src2))]>,
+ Sched<[WriteIMul64Imm.Folded]>;
} // Defs = [EFLAGS]
// unsigned division/remainder
@@ -436,11 +438,10 @@ def X86sub_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
// TODO: inc/dec is slow for P4, but fast for Pentium-M.
let Defs = [EFLAGS] in {
let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
-let CodeSize = 2 in
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"inc{b}\t$dst",
[(set GR8:$dst, EFLAGS, (X86add_flag_nocf GR8:$src1, 1))]>;
-let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86add_flag_nocf GR16:$src1, 1))]>,
@@ -484,11 +485,10 @@ let Predicates = [UseIncDec, In64BitMode] in {
} // CodeSize = 2, SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
-let CodeSize = 2 in
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"dec{b}\t$dst",
[(set GR8:$dst, EFLAGS, (X86sub_flag_nocf GR8:$src1, 1))]>;
-let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86sub_flag_nocf GR16:$src1, 1))]>,
@@ -605,16 +605,16 @@ def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem,
- Imm8, i8imm, imm8_su, i8imm, invalid_node,
+ Imm8, i8imm, relocImm8_su, i8imm, invalid_node,
0, OpSizeFixed, 0>;
def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
- Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su,
+ Imm16, i16imm, relocImm16_su, i16i8imm, i16immSExt8_su,
1, OpSize16, 0>;
def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
- Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su,
+ Imm32, i32imm, relocImm32_su, i32i8imm, i32immSExt8_su,
1, OpSize32, 0>;
def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
- Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su,
+ Imm32S, i64i32imm, i64relocImmSExt32_su, i64i8imm, i64immSExt8_su,
1, OpSizeFixed, 1>;
/// ITy - This instruction base class takes the type info for the instruction.
@@ -924,11 +924,12 @@ class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
string mnemonic, Format RegMRM, Format MemMRM,
SDNode opnodeflag, SDNode opnode,
- bit CommutableRR, bit ConvertibleToThreeAddress> {
+ bit CommutableRR, bit ConvertibleToThreeAddress,
+ bit ConvertibleToThreeAddressRR> {
let Defs = [EFLAGS] in {
let Constraints = "$src1 = $dst" in {
let isCommutable = CommutableRR in {
- let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in {
def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
@@ -1169,16 +1170,16 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
- X86and_flag, and, 1, 0>;
+ X86and_flag, and, 1, 0, 0>;
defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
- X86or_flag, or, 1, 0>;
+ X86or_flag, or, 1, 0, 0>;
defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
- X86xor_flag, xor, 1, 0>;
+ X86xor_flag, xor, 1, 0, 0>;
defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
- X86add_flag, add, 1, 1>;
+ X86add_flag, add, 1, 1, 1>;
let isCompare = 1 in {
defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
- X86sub_flag, sub, 0, 0>;
+ X86sub_flag, sub, 0, 1, 0>;
}
// Arithmetic.
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index dcce7b9951f2..50aed98112c3 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -1,9 +1,8 @@
//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index f5494fc0b13f..099f6aa8d8bb 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -1,9 +1,8 @@
//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,99 +13,94 @@
// CMOV instructions.
-multiclass CMOV<bits<8> opc, string Mnemonic, X86FoldableSchedWrite Sched,
- PatLeaf CondNode> {
- let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
- isCommutable = 1, SchedRW = [Sched] in {
- def NAME#16rr
- : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
- !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
- [(set GR16:$dst,
- (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))]>,
- TB, OpSize16;
- def NAME#32rr
- : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
- !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
- [(set GR32:$dst,
- (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))]>,
- TB, OpSize32;
- def NAME#64rr
- :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
- !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
- [(set GR64:$dst,
- (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))]>, TB;
- }
-
- let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
- SchedRW = [Sched.Folded, Sched.ReadAfterFold] in {
- def NAME#16rm
- : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
- !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
- [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
- CondNode, EFLAGS))]>, TB, OpSize16;
- def NAME#32rm
- : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
- !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
- [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
- CondNode, EFLAGS))]>, TB, OpSize32;
- def NAME#64rm
- :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
- !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
- [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
- CondNode, EFLAGS))]>, TB;
- } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
-} // end multiclass
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ isCommutable = 1, SchedRW = [WriteCMOV] in {
+ def CMOV16rr
+ : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond),
+ "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst,
+ (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>,
+ TB, OpSize16;
+ def CMOV32rr
+ : I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond),
+ "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst,
+ (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>,
+ TB, OpSize32;
+ def CMOV64rr
+ :RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond),
+ "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst,
+ (X86cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>, TB;
+}
+let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in {
+ def CMOV16rm
+ : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond),
+ "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ imm:$cond, EFLAGS))]>, TB, OpSize16;
+ def CMOV32rm
+ : I<0x40, MRMSrcMemCC, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond),
+ "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ imm:$cond, EFLAGS))]>, TB, OpSize32;
+ def CMOV64rm
+ :RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond),
+ "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ imm:$cond, EFLAGS))]>, TB;
+} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
+} // isCodeGenOnly = 1, ForceDisassemble = 1
-// Conditional Moves.
-defm CMOVO : CMOV<0x40, "cmovo" , WriteCMOV, X86_COND_O>;
-defm CMOVNO : CMOV<0x41, "cmovno", WriteCMOV, X86_COND_NO>;
-defm CMOVB : CMOV<0x42, "cmovb" , WriteCMOV, X86_COND_B>;
-defm CMOVAE : CMOV<0x43, "cmovae", WriteCMOV, X86_COND_AE>;
-defm CMOVE : CMOV<0x44, "cmove" , WriteCMOV, X86_COND_E>;
-defm CMOVNE : CMOV<0x45, "cmovne", WriteCMOV, X86_COND_NE>;
-defm CMOVBE : CMOV<0x46, "cmovbe", WriteCMOV2, X86_COND_BE>;
-defm CMOVA : CMOV<0x47, "cmova" , WriteCMOV2, X86_COND_A>;
-defm CMOVS : CMOV<0x48, "cmovs" , WriteCMOV, X86_COND_S>;
-defm CMOVNS : CMOV<0x49, "cmovns", WriteCMOV, X86_COND_NS>;
-defm CMOVP : CMOV<0x4A, "cmovp" , WriteCMOV, X86_COND_P>;
-defm CMOVNP : CMOV<0x4B, "cmovnp", WriteCMOV, X86_COND_NP>;
-defm CMOVL : CMOV<0x4C, "cmovl" , WriteCMOV, X86_COND_L>;
-defm CMOVGE : CMOV<0x4D, "cmovge", WriteCMOV, X86_COND_GE>;
-defm CMOVLE : CMOV<0x4E, "cmovle", WriteCMOV, X86_COND_LE>;
-defm CMOVG : CMOV<0x4F, "cmovg" , WriteCMOV, X86_COND_G>;
+// SetCC instructions.
+let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond),
+ "set${cond}\t$dst",
+ [(set GR8:$dst, (X86setcc imm:$cond, EFLAGS))]>,
+ TB, Sched<[WriteSETCC]>;
+ def SETCCm : I<0x90, MRMXmCC, (outs), (ins i8mem:$dst, ccode:$cond),
+ "set${cond}\t$dst",
+ [(store (X86setcc imm:$cond, EFLAGS), addr:$dst)]>,
+ TB, Sched<[WriteSETCCStore]>;
+} // Uses = [EFLAGS]
+multiclass CMOV_SETCC_Aliases<string Cond, int CC> {
+ def : InstAlias<"cmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+ (CMOV16rr GR16:$dst, GR16:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+ (CMOV16rm GR16:$dst, i16mem:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+ (CMOV32rr GR32:$dst, GR32:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+ (CMOV32rm GR32:$dst, i32mem:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+ (CMOV64rr GR64:$dst, GR64:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+ (CMOV64rm GR64:$dst, i64mem:$src, CC), 0>;
-// SetCC instructions.
-multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
- let Uses = [EFLAGS] in {
- def r : I<opc, MRMXr, (outs GR8:$dst), (ins),
- !strconcat(Mnemonic, "\t$dst"),
- [(set GR8:$dst, (X86setcc OpNode, EFLAGS))]>,
- TB, Sched<[WriteSETCC]>;
- def m : I<opc, MRMXm, (outs), (ins i8mem:$dst),
- !strconcat(Mnemonic, "\t$dst"),
- [(store (X86setcc OpNode, EFLAGS), addr:$dst)]>,
- TB, Sched<[WriteSETCCStore]>;
- } // Uses = [EFLAGS]
+ def : InstAlias<"set"#Cond#"\t$dst", (SETCCr GR8:$dst, CC), 0>;
+ def : InstAlias<"set"#Cond#"\t$dst", (SETCCm i8mem:$dst, CC), 0>;
}
-defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set
-defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set
-defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than
-defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal
-defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to
-defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to
-defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal
-defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than
-defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is signed bit set
-defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is not signed
-defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set
-defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set
-defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than
-defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal
-defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal
-defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than
+defm : CMOV_SETCC_Aliases<"o" , 0>;
+defm : CMOV_SETCC_Aliases<"no", 1>;
+defm : CMOV_SETCC_Aliases<"b" , 2>;
+defm : CMOV_SETCC_Aliases<"ae", 3>;
+defm : CMOV_SETCC_Aliases<"e" , 4>;
+defm : CMOV_SETCC_Aliases<"ne", 5>;
+defm : CMOV_SETCC_Aliases<"be", 6>;
+defm : CMOV_SETCC_Aliases<"a" , 7>;
+defm : CMOV_SETCC_Aliases<"s" , 8>;
+defm : CMOV_SETCC_Aliases<"ns", 9>;
+defm : CMOV_SETCC_Aliases<"p" , 10>;
+defm : CMOV_SETCC_Aliases<"np", 11>;
+defm : CMOV_SETCC_Aliases<"l" , 12>;
+defm : CMOV_SETCC_Aliases<"ge", 13>;
+defm : CMOV_SETCC_Aliases<"le", 14>;
+defm : CMOV_SETCC_Aliases<"g" , 15>;
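
The 0-15 arguments fed to CMOV_SETCC_Aliases above are the raw x86 condition-code encodings now carried by the ccode operand (the low nibble of the 0F 4x and 0F 9x opcodes). As a standalone illustration only, not part of the patch and not LLVM's X86::CondCode enum, here is a small C++ program that prints the suffix-to-encoding table implied by the defm list:

#include <array>
#include <cstdio>

// Condition-code suffixes in encoding order, as implied by the
// CMOV_SETCC_Aliases instantiations above (0 = o, 1 = no, ..., 15 = g).
static const std::array<const char *, 16> CondSuffix = {
    "o", "no", "b", "ae", "e",  "ne", "be", "a",
    "s", "ns", "p", "np", "l",  "ge", "le", "g"};

int main() {
  for (unsigned CC = 0; CC < CondSuffix.size(); ++CC)
    std::printf("cmov%-2s / set%-2s -> condition code %2u (opcodes 0x4%X / 0x9%X)\n",
                CondSuffix[CC], CondSuffix[CC], CC, CC, CC);
  return 0;
}
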
// SALC is an undocumented instruction. Information for this instruction can be found
// here http://www.rcollins.org/secrets/opcodes/SALC.html
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 394dca8e7817..efaccdc9ee96 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -1,9 +1,8 @@
//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -20,11 +19,6 @@ def GetLo32XForm : SDNodeXForm<imm, [{
return getI32Imm((uint32_t)N->getZExtValue(), SDLoc(N));
}]>;
-def GetLo8XForm : SDNodeXForm<imm, [{
- // Transformation function: get the low 8 bits.
- return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
-}]>;
-
//===----------------------------------------------------------------------===//
// Random Pseudo Instructions.
@@ -360,7 +354,7 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
// this happens, it is great. However, if we are left with an 8-bit sbb and an
// and, we might as well just match it as a setb.
def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
- (SETBr)>;
+ (SETCCr (i8 2))>;
// Patterns to give priority when both inputs are zero so that we don't use
// an immediate for the RHS.
@@ -574,8 +568,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;
- defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
- defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
+ let Predicates = [NoAVX512] in {
+ defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
+ defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
+ }
+ let Predicates = [HasAVX512] in {
+ defm _FR32X : CMOVrr_PSEUDO<FR32X, f32>;
+ defm _FR64X : CMOVrr_PSEUDO<FR64X, f64>;
+ }
let Predicates = [NoVLX] in {
defm _VR128 : CMOVrr_PSEUDO<VR128, v2i64>;
defm _VR256 : CMOVrr_PSEUDO<VR256, v4i64>;
@@ -712,6 +712,32 @@ def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, GR64:$src2))]>, LOCK;
+// NOTE: These are order specific, we want the mi8 forms to be listed
+// first so that they are slightly preferred to the mi forms.
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>,
+ OpSize16, LOCK;
+
+def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>,
+ OpSize32, LOCK;
+
+def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>,
+ LOCK;
+
def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
@@ -742,30 +768,6 @@ def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))]>,
LOCK;
-
-def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
- !strconcat(mnemonic, "{w}\t",
- "{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>,
- OpSize16, LOCK;
-
-def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
- !strconcat(mnemonic, "{l}\t",
- "{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>,
- OpSize32, LOCK;
-
-def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
- !strconcat(mnemonic, "{q}\t",
- "{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>,
- LOCK;
}
}
@@ -868,7 +870,7 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in {
}
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
- SchedRW = [WriteCMPXCHGRMW] in {
+ Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW] in {
defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
}
@@ -892,8 +894,9 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
// the instruction and we are sure we will have a valid register to restore
// the value of RBX.
let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
- SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, isPseudo = 1,
- Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in {
+ Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, isPseudo = 1, Constraints = "$ebx_save = $dst",
+ usesCustomInserter = 1 in {
def LCMPXCHG8B_SAVE_EBX :
I<0, Pseudo, (outs GR32:$dst),
(ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
@@ -904,14 +907,14 @@ def LCMPXCHG8B_SAVE_EBX :
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
- Predicates = [HasCmpxchg16b], SchedRW = [WriteCMPXCHGRMW] in {
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW] in {
defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
X86cas16, i128mem>, REX_W;
}
// Same as LCMPXCHG8B_SAVE_EBX but for the 16 Bytes variant.
let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
- Predicates = [HasCmpxchg16b], SchedRW = [WriteCMPXCHGRMW],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst",
usesCustomInserter = 1 in {
def LCMPXCHG16B_SAVE_RBX :
@@ -1001,28 +1004,31 @@ defm : RELEASE_BINOP_MI<"OR", or>;
defm : RELEASE_BINOP_MI<"XOR", xor>;
defm : RELEASE_BINOP_MI<"SUB", sub>;
-// Same as above, but for floating-point.
-// FIXME: imm version.
-// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
+// Atomic load + floating point patterns.
// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
-let usesCustomInserter = 1, SchedRW = [WriteMicrocoded] in {
-multiclass RELEASE_FP_BINOP_MI<SDNode op> {
- def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
- "#BINOP "#NAME#"32mr PSEUDO!",
- [(atomic_store_32 addr:$dst,
- (i32 (bitconvert (op
- (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
- FR32:$src))))]>, Requires<[HasSSE1]>;
- def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
- "#BINOP "#NAME#"64mr PSEUDO!",
- [(atomic_store_64 addr:$dst,
- (i64 (bitconvert (op
- (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
- FR64:$src))))]>, Requires<[HasSSE2]>;
+multiclass ATOMIC_LOAD_FP_BINOP_MI<string Name, SDNode op> {
+ def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+ (!cast<Instruction>(Name#"SSrm") FR32:$src1, addr:$src2)>,
+ Requires<[UseSSE1]>;
+ def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SSrm") FR32:$src1, addr:$src2)>,
+ Requires<[UseAVX]>;
+ def : Pat<(op FR32X:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SSZrm") FR32X:$src1, addr:$src2)>,
+ Requires<[HasAVX512]>;
+
+ def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+ (!cast<Instruction>(Name#"SDrm") FR64:$src1, addr:$src2)>,
+        Requires<[UseSSE2]>;
+ def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SDrm") FR64:$src1, addr:$src2)>,
+ Requires<[UseAVX]>;
+ def : Pat<(op FR64X:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SDZrm") FR64X:$src1, addr:$src2)>,
+ Requires<[HasAVX512]>;
}
-defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>;
+defm : ATOMIC_LOAD_FP_BINOP_MI<"ADD", fadd>;
// FIXME: Add fsub, fmul, fdiv, ...
-}
multiclass RELEASE_UNOP<string Name, dag dag8, dag dag16, dag dag32,
dag dag64> {
@@ -1083,6 +1089,35 @@ def : Pat<(i16 (atomic_load_16 addr:$src)), (MOV16rm addr:$src)>;
def : Pat<(i32 (atomic_load_32 addr:$src)), (MOV32rm addr:$src)>;
def : Pat<(i64 (atomic_load_64 addr:$src)), (MOV64rm addr:$src)>;
+// Floating point loads/stores.
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+ (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+ (VMOVSSmr addr:$dst, FR32:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+ (VMOVSSZmr addr:$dst, FR32:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+ (MOVSDmr addr:$dst, FR64:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+ (VMOVSDmr addr:$dst, FR64:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+          (VMOVSDZmr addr:$dst, FR64:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+ (MOVSSrm_alt addr:$src)>, Requires<[UseSSE1]>;
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+ (VMOVSSrm_alt addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+ (VMOVSSZrm_alt addr:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+ (MOVSDrm_alt addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+ (VMOVSDrm_alt addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+ (VMOVSDZrm_alt addr:$src)>, Requires<[HasAVX512]>;
+
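
For context (not part of the patch): the bitconvert-of-atomic patterns just above match the shape X86 ISel produces for atomic float/double loads and stores, so a relaxed std::atomic<float> access can be selected as a single naturally aligned scalar SSE/AVX move. A minimal user-level C++ example of the kind of code these patterns are aimed at, assuming natural alignment:

#include <atomic>

std::atomic<float> Acc{0.0f};

// Expected to lower through the (f32 (bitconvert (atomic_load_32 ...)))
// pattern above, i.e. a plain movss/vmovss load.
float readAcc() {
  return Acc.load(std::memory_order_relaxed);
}

// Expected to lower through the (atomic_store_32 ..., (bitconvert f32))
// pattern above, i.e. a plain movss/vmovss store.
void writeAcc(float V) {
  Acc.store(V, std::memory_order_relaxed);
}
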
//===----------------------------------------------------------------------===//
// DAG Pattern Matching Rules
//===----------------------------------------------------------------------===//
@@ -1241,37 +1276,23 @@ def : Pat<(X86cmp GR32:$src1, 0),
def : Pat<(X86cmp GR64:$src1, 0),
(TEST64rr GR64:$src1, GR64:$src1)>;
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+ X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue());
+ return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC),
+ SDLoc(N), MVT::i8);
+}]>;
+
// Conditional moves with folded loads with operands swapped and conditions
// inverted.
-multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
- Instruction Inst64> {
- let Predicates = [HasCMov] in {
- def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
- (Inst16 GR16:$src2, addr:$src1)>;
- def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
- (Inst32 GR32:$src2, addr:$src1)>;
- def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
- (Inst64 GR64:$src2, addr:$src1)>;
- }
+let Predicates = [HasCMov] in {
+ def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, imm:$cond, EFLAGS),
+ (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
+ def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, imm:$cond, EFLAGS),
+ (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
+ def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, imm:$cond, EFLAGS),
+ (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
}
-defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
-defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
-defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
-defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
-defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
-defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
-defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
-defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
-defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
-defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
-defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
-defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
-defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
-defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
-defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
-defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
-
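
The new load-folding CMOV patterns above (replacing the per-condition CMOVmr multiclass deleted here) swap the operands, which is only valid if the condition is inverted at the same time; inv_cond_XFORM delegates that to X86::GetOppositeBranchCondition. A minimal standalone sketch, not LLVM's implementation, of the property this relies on: for the 16 standard condition codes the inverse condition differs only in the low encoding bit.

#include <cassert>
#include <cstdint>

// Sketch only: LLVM's real helper is X86::GetOppositeBranchCondition.
// The pairs (o,no), (b,ae), (e,ne), (be,a), (s,ns), (p,np), (l,ge), (le,g)
// occupy adjacent encodings, so inversion amounts to flipping bit 0.
static uint8_t invertCond(uint8_t CC) {
  assert(CC < 16 && "expected a 4-bit x86 condition code");
  return CC ^ 1;
}

int main() {
  assert(invertCond(4) == 5);             // e  -> ne
  assert(invertCond(13) == 12);           // ge -> l
  assert(invertCond(invertCond(7)) == 7); // inversion is an involution
  return 0;
}
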
// zextload bool -> zextload byte
// i1 stored in one byte in zero-extended form.
// Upper bits cleanup should be executed before Store.
@@ -1298,14 +1319,16 @@ def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
// For other extloads, use subregs, since the high contents of the register are
// defined after an extload.
+// NOTE: The extloadi64i32 pattern needs to be first as it will try to form
+// 32-bit loads for 4 byte aligned i8/i16 loads.
+def : Pat<(extloadi64i32 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
def : Pat<(extloadi64i1 addr:$src),
(SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i8 addr:$src),
(SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i16 addr:$src),
(SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
-def : Pat<(extloadi64i32 addr:$src),
- (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
// anyext. Define these to do an explicit zero-extend to
// avoid partial-register updates.
@@ -1351,6 +1374,8 @@ def def32 : PatLeaf<(i32 GR32:$src), [{
// we can use a SUBREG_TO_REG.
def : Pat<(i64 (zext def32:$src)),
(SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+def : Pat<(i64 (and (anyext def32:$src), 0x00000000FFFFFFFF)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
//===----------------------------------------------------------------------===//
// Pattern match OR as ADD
@@ -1377,9 +1402,12 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
// Try this before the selecting to OR.
let SchedRW = [WriteALU] in {
-let isConvertibleToThreeAddress = 1,
+let isConvertibleToThreeAddress = 1, isPseudo = 1,
Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
let isCommutable = 1 in {
+def ADD8rr_DB : I<0, Pseudo, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "", // orb/addb REG, REG
+ [(set GR8:$dst, (or_is_add GR8:$src1, GR8:$src2))]>;
def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"", // orw/addw REG, REG
[(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
@@ -1394,6 +1422,10 @@ def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
+def ADD8ri_DB : I<0, Pseudo,
+ (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "", // orb/addb REG, imm8
+ [(set GR8:$dst, (or_is_add GR8:$src1, imm:$src2))]>;
def ADD16ri8_DB : I<0, Pseudo,
(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"", // orw/addw REG, imm8
@@ -1483,6 +1515,13 @@ def : Pat<(add GR64:$src1, 128),
def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
(SUB64mi8 addr:$dst, -128)>;
+def : Pat<(X86add_flag_nocf GR16:$src1, 128),
+ (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(X86add_flag_nocf GR32:$src1, 128),
+ (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(X86add_flag_nocf GR64:$src1, 128),
+ (SUB64ri8 GR64:$src1, -128)>;
+
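
A quick aside on why the add-128 forms above are rewritten as subtractions (illustration only, not part of the patch): +128 is outside the sign-extended 8-bit immediate range [-128, 127], but -128 is inside it, and in two's complement arithmetic x + 128 and x - (-128) are the same value.

#include <cassert>
#include <cstdint>

int main() {
  // imm8 can encode -128 but not +128, which is why the ADD ri8 form cannot
  // be used directly while SUB ri8 with -128 can.
  static_assert(INT8_MIN == -128 && INT8_MAX == 127, "imm8 range");

  // x + 128 == x - (uint32_t)-128 for every 32-bit x (wrap-around arithmetic).
  for (uint32_t x : {0u, 1u, 0x7fffff80u, 0xffffffffu})
    assert(x + 128u == x - static_cast<uint32_t>(-128));
  return 0;
}
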
// The same trick applies for 32-bit immediate fields in 64-bit
// instructions.
def : Pat<(add GR64:$src1, 0x0000000080000000),
@@ -1490,6 +1529,9 @@ def : Pat<(add GR64:$src1, 0x0000000080000000),
def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
(SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000),
+ (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+
// To avoid needing to materialize an immediate in a register, use a 32-bit and
// with implicit zero-extension instead of a 64-bit and if the immediate has at
// least 32 bits of leading zeros. If in addition the last 32 bits can be
@@ -1504,7 +1546,7 @@ def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
(i64 0),
(AND32ri8
(EXTRACT_SUBREG GR64:$src, sub_32bit),
- (i32 (GetLo8XForm imm:$imm))),
+ (i32 (GetLo32XForm imm:$imm))),
sub_32bit)>;
def : Pat<(and GR64:$src, i64immZExt32:$imm),
@@ -1714,40 +1756,43 @@ def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
-// Helper imms to check if a mask doesn't change significant shift/rotate bits.
-def immShift8 : ImmLeaf<i8, [{
- return countTrailingOnes<uint64_t>(Imm) >= 3;
+def shiftMask8 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 3);
}]>;
-def immShift16 : ImmLeaf<i8, [{
- return countTrailingOnes<uint64_t>(Imm) >= 4;
+
+def shiftMask16 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 4);
}]>;
-def immShift32 : ImmLeaf<i8, [{
- return countTrailingOnes<uint64_t>(Imm) >= 5;
+
+def shiftMask32 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 5);
}]>;
-def immShift64 : ImmLeaf<i8, [{
- return countTrailingOnes<uint64_t>(Imm) >= 6;
+
+def shiftMask64 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 6);
}]>;
+
// Shift amount is implicitly masked.
multiclass MaskedShiftAmountPats<SDNode frag, string name> {
// (shift x (and y, 31)) ==> (shift x, y)
- def : Pat<(frag GR8:$src1, (and CL, immShift32)),
+ def : Pat<(frag GR8:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "8rCL") GR8:$src1)>;
- def : Pat<(frag GR16:$src1, (and CL, immShift32)),
+ def : Pat<(frag GR16:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "16rCL") GR16:$src1)>;
- def : Pat<(frag GR32:$src1, (and CL, immShift32)),
+ def : Pat<(frag GR32:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "32rCL") GR32:$src1)>;
- def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+ def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "8mCL") addr:$dst)>;
- def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+ def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "16mCL") addr:$dst)>;
- def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "32mCL") addr:$dst)>;
// (shift x (and y, 63)) ==> (shift x, y)
- def : Pat<(frag GR64:$src1, (and CL, immShift64)),
+ def : Pat<(frag GR64:$src1, (shiftMask64 CL)),
(!cast<Instruction>(name # "64rCL") GR64:$src1)>;
- def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
+ def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst),
(!cast<Instruction>(name # "64mCL") addr:$dst)>;
}
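
The shiftMask8/16/32/64 fragments above match an AND of the shift amount whose mask cannot change the bits the hardware actually consumes (x86 shifts take the count modulo 32 or 64), so the AND can simply be dropped when the pattern is selected. Below is a standalone sketch of the trailing-ones test this relies on; the real isUnneededShiftMask in X86ISelDAGToDAG.cpp may additionally consult known-bits information, so treat this as an approximation.

#include <bit>
#include <cassert>
#include <cstdint>

// Returns true when ANDing the shift count with Mask cannot affect a shift
// that only reads the low `Width` bits of the count (Width = 5 for 32-bit
// shifts, 6 for 64-bit shifts).
static bool maskIsUnneeded(uint64_t Mask, int Width) {
  return std::countr_one(Mask) >= Width;
}

int main() {
  assert(maskIsUnneeded(0x1f, 5));  // (x & 31) with a 32-bit shift: droppable
  assert(maskIsUnneeded(0x3f, 5));  // wider masks are droppable too
  assert(!maskIsUnneeded(0x1f, 6)); // but not for a 64-bit shift count
  return 0;
}
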
@@ -1763,23 +1808,23 @@ defm : MaskedShiftAmountPats<sra, "SAR">;
// not tracking flags for these nodes.
multiclass MaskedRotateAmountPats<SDNode frag, string name> {
// (rot x (and y, BitWidth - 1)) ==> (rot x, y)
- def : Pat<(frag GR8:$src1, (and CL, immShift8)),
+ def : Pat<(frag GR8:$src1, (shiftMask8 CL)),
(!cast<Instruction>(name # "8rCL") GR8:$src1)>;
- def : Pat<(frag GR16:$src1, (and CL, immShift16)),
+ def : Pat<(frag GR16:$src1, (shiftMask16 CL)),
(!cast<Instruction>(name # "16rCL") GR16:$src1)>;
- def : Pat<(frag GR32:$src1, (and CL, immShift32)),
+ def : Pat<(frag GR32:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "32rCL") GR32:$src1)>;
- def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift8)), addr:$dst),
+ def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask8 CL)), addr:$dst),
(!cast<Instruction>(name # "8mCL") addr:$dst)>;
- def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift16)), addr:$dst),
+ def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask16 CL)), addr:$dst),
(!cast<Instruction>(name # "16mCL") addr:$dst)>;
- def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "32mCL") addr:$dst)>;
// (rot x (and y, 63)) ==> (rot x, y)
- def : Pat<(frag GR64:$src1, (and CL, immShift64)),
+ def : Pat<(frag GR64:$src1, (shiftMask64 CL)),
(!cast<Instruction>(name # "64rCL") GR64:$src1)>;
- def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
+ def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst),
(!cast<Instruction>(name # "64mCL") addr:$dst)>;
}
@@ -1790,13 +1835,13 @@ defm : MaskedRotateAmountPats<rotr, "ROR">;
// Double shift amount is implicitly masked.
multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
// (shift x (and y, 31)) ==> (shift x, y)
- def : Pat<(frag GR16:$src1, GR16:$src2, (and CL, immShift32)),
+ def : Pat<(frag GR16:$src1, GR16:$src2, (shiftMask32 CL)),
(!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
- def : Pat<(frag GR32:$src1, GR32:$src2, (and CL, immShift32)),
+ def : Pat<(frag GR32:$src1, GR32:$src2, (shiftMask32 CL)),
(!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;
// (shift x (and y, 63)) ==> (shift x, y)
- def : Pat<(frag GR64:$src1, GR64:$src2, (and CL, immShift64)),
+  def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask64 CL)),
(!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
}
@@ -1805,57 +1850,57 @@ defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
let Predicates = [HasBMI2] in {
let AddedComplexity = 1 in {
- def : Pat<(sra GR32:$src1, (and GR8:$src2, immShift32)),
+ def : Pat<(sra GR32:$src1, (shiftMask32 GR8:$src2)),
(SARX32rr GR32:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(sra GR64:$src1, (and GR8:$src2, immShift64)),
+ def : Pat<(sra GR64:$src1, (shiftMask64 GR8:$src2)),
(SARX64rr GR64:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl GR32:$src1, (and GR8:$src2, immShift32)),
+ def : Pat<(srl GR32:$src1, (shiftMask32 GR8:$src2)),
(SHRX32rr GR32:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl GR64:$src1, (and GR8:$src2, immShift64)),
+ def : Pat<(srl GR64:$src1, (shiftMask64 GR8:$src2)),
(SHRX64rr GR64:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl GR32:$src1, (and GR8:$src2, immShift32)),
+ def : Pat<(shl GR32:$src1, (shiftMask32 GR8:$src2)),
(SHLX32rr GR32:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl GR64:$src1, (and GR8:$src2, immShift64)),
+ def : Pat<(shl GR64:$src1, (shiftMask64 GR8:$src2)),
(SHLX64rr GR64:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
- def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ def : Pat<(sra (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(SARX32rm addr:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ def : Pat<(sra (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(SARX64rm addr:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ def : Pat<(srl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(SHRX32rm addr:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ def : Pat<(srl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(SHRX64rm addr:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ def : Pat<(shl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(SHLX32rm addr:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ def : Pat<(shl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(SHLX64rm addr:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
@@ -1864,7 +1909,7 @@ let Predicates = [HasBMI2] in {
// Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location.
multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR,
Instruction BTS, Instruction BTC,
- ImmLeaf ImmShift> {
+ PatFrag ShiftMask> {
def : Pat<(and RC:$src1, (rotl -2, GR8:$src2)),
(BTR RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
@@ -1876,20 +1921,20 @@ multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
// Similar to above, but removing unneeded masking of the shift amount.
- def : Pat<(and RC:$src1, (rotl -2, (and GR8:$src2, ImmShift))),
+ def : Pat<(and RC:$src1, (rotl -2, (ShiftMask GR8:$src2))),
(BTR RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(or RC:$src1, (shl 1, (and GR8:$src2, ImmShift))),
+ def : Pat<(or RC:$src1, (shl 1, (ShiftMask GR8:$src2))),
(BTS RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(xor RC:$src1, (shl 1, (and GR8:$src2, ImmShift))),
+ def : Pat<(xor RC:$src1, (shl 1, (ShiftMask GR8:$src2))),
(BTC RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
-defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, immShift16>;
-defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, immShift32>;
-defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, immShift64>;
+defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>;
+defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>;
+defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>;
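
For reference (illustration only): these are the source-level bit manipulations the one_bit_patterns instantiations above are meant to catch. With a variable bit index, ~(1u << N) is canonicalized in the DAG to a rotate of -2 by N, and set/toggle become OR/XOR with a shifted 1, which the patterns then turn into btr/bts/btc.

#include <cstdint>

uint32_t clearBit(uint32_t X, unsigned N)  { return X & ~(1u << N); } // -> btr
uint32_t setBit(uint32_t X, unsigned N)    { return X |  (1u << N); } // -> bts
uint32_t toggleBit(uint32_t X, unsigned N) { return X ^  (1u << N); } // -> btc
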
// (anyext (setcc_carry)) -> (setcc_carry)
@@ -1974,8 +2019,6 @@ def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
// sub reg, relocImm
def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2),
(SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
-def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt32_su:$src2),
- (SUB64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
// mul reg, reg
def : Pat<(mul GR16:$src1, GR16:$src2),
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index a7c7aaab2285..f82e80965b7c 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -1,9 +1,8 @@
//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -71,35 +70,40 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
}
// Conditional Branches.
-let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
- multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
- def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
- [(X86brcond bb:$dst, Cond, EFLAGS)]>;
- let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
- def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
- []>, OpSize16, TB;
- def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
- []>, TB, OpSize32;
- }
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump],
+ isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs),
+ (ins brtarget8:$dst, ccode:$cond),
+ "j${cond}\t$dst",
+ [(X86brcond bb:$dst, imm:$cond, EFLAGS)]>;
+ let hasSideEffects = 0 in {
+ def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs),
+ (ins brtarget16:$dst, ccode:$cond),
+ "j${cond}\t$dst",
+ []>, OpSize16, TB;
+ def JCC_4 : Ii32PCRel<0x80, AddCCFrm, (outs),
+ (ins brtarget32:$dst, ccode:$cond),
+ "j${cond}\t$dst",
+ []>, TB, OpSize32;
}
}
-defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
-defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>;
-defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
-defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
-defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
-defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
-defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
-defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
-defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
-defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
-defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
-defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
-defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
-defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
-defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
-defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
+def : InstAlias<"jo\t$dst", (JCC_1 brtarget8:$dst, 0), 0>;
+def : InstAlias<"jno\t$dst", (JCC_1 brtarget8:$dst, 1), 0>;
+def : InstAlias<"jb\t$dst", (JCC_1 brtarget8:$dst, 2), 0>;
+def : InstAlias<"jae\t$dst", (JCC_1 brtarget8:$dst, 3), 0>;
+def : InstAlias<"je\t$dst", (JCC_1 brtarget8:$dst, 4), 0>;
+def : InstAlias<"jne\t$dst", (JCC_1 brtarget8:$dst, 5), 0>;
+def : InstAlias<"jbe\t$dst", (JCC_1 brtarget8:$dst, 6), 0>;
+def : InstAlias<"ja\t$dst", (JCC_1 brtarget8:$dst, 7), 0>;
+def : InstAlias<"js\t$dst", (JCC_1 brtarget8:$dst, 8), 0>;
+def : InstAlias<"jns\t$dst", (JCC_1 brtarget8:$dst, 9), 0>;
+def : InstAlias<"jp\t$dst", (JCC_1 brtarget8:$dst, 10), 0>;
+def : InstAlias<"jnp\t$dst", (JCC_1 brtarget8:$dst, 11), 0>;
+def : InstAlias<"jl\t$dst", (JCC_1 brtarget8:$dst, 12), 0>;
+def : InstAlias<"jge\t$dst", (JCC_1 brtarget8:$dst, 13), 0>;
+def : InstAlias<"jle\t$dst", (JCC_1 brtarget8:$dst, 14), 0>;
+def : InstAlias<"jg\t$dst", (JCC_1 brtarget8:$dst, 15), 0>;
// jcx/jecx/jrcx instructions.
let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index c24d6d5b8df1..06e605fe5db2 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -1,9 +1,8 @@
//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -29,11 +28,11 @@ let hasSideEffects = 0 in {
let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX)
def CDQE : RI<0x98, RawFrm, (outs), (ins),
- "{cltq|cdqe}", []>, Sched<[WriteALU]>;
+ "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX)
def CQO : RI<0x99, RawFrm, (outs), (ins),
- "{cqto|cqo}", []>, Sched<[WriteALU]>;
+ "{cqto|cqo}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
}
// Sign/Zero extenders
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 1a8e529431af..0cca71bdc431 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -1,9 +1,8 @@
//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -237,7 +236,8 @@ multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
-let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+ hasSideEffects = 0 in
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
SDNode OpNode, RegisterClass RC,
@@ -263,8 +263,7 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// the lowest element of the FMA*_Int instruction. Even though such analysis
// may be not implemented yet we allow the routines doing the actual commute
// transformation to decide if one or another instruction is commutable or not.
-let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
- hasSideEffects = 0 in
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
Operand memopr, RegisterClass RC,
X86FoldableSchedWrite sched> {
diff --git a/lib/Target/X86/X86InstrFMA3Info.cpp b/lib/Target/X86/X86InstrFMA3Info.cpp
index def732a2dd00..25bbdddb7a21 100644
--- a/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -1,9 +1,8 @@
//===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -57,7 +56,7 @@ using namespace llvm;
#define FMA3GROUP_SCALAR(Name, Attrs) \
FMA3GROUP_SCALAR_WIDTHS(Name, SD, Attrs) \
- FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs)
#define FMA3GROUP_FULL(Name, Attrs) \
FMA3GROUP_PACKED(Name, Attrs) \
@@ -159,11 +158,9 @@ const X86InstrFMA3Group *llvm::getFMA3Group(unsigned Opcode, uint64_t TSFlags) {
// FMA 231 instructions have an opcode of 0xB6-0xBF
unsigned FormIndex = ((BaseOpcode - 0x90) >> 4) & 0x3;
- auto I = std::lower_bound(Table.begin(), Table.end(), Opcode,
- [FormIndex](const X86InstrFMA3Group &Group,
- unsigned Opcode) {
- return Group.Opcodes[FormIndex] < Opcode;
- });
+ auto I = partition_point(Table, [=](const X86InstrFMA3Group &Group) {
+ return Group.Opcodes[FormIndex] < Opcode;
+ });
assert(I != Table.end() && I->Opcodes[FormIndex] == Opcode &&
"Couldn't find FMA3 opcode!");
return I;
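
A standard-C++ aside on the change above, using plain types rather than LLVM's X86InstrFMA3Group: on a range sorted by the key being searched, partition_point with the predicate "key < target" returns the first element for which the predicate is false, which is exactly what lower_bound with the equivalent comparator returned, so the rewrite is behaviour-preserving.

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // Stand-ins for a table sorted by Opcodes[FormIndex].
  std::vector<unsigned> Opcodes = {0x96, 0x97, 0x98, 0x9A, 0x9B};
  unsigned Target = 0x98;

  auto A = std::partition_point(Opcodes.begin(), Opcodes.end(),
                                [Target](unsigned Op) { return Op < Target; });
  auto B = std::lower_bound(Opcodes.begin(), Opcodes.end(), Target);
  assert(A == B && *A == Target);
  return 0;
}
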
diff --git a/lib/Target/X86/X86InstrFMA3Info.h b/lib/Target/X86/X86InstrFMA3Info.h
index 6eec1db98bf8..7fa6f5917862 100644
--- a/lib/Target/X86/X86InstrFMA3Info.h
+++ b/lib/Target/X86/X86InstrFMA3Info.h
@@ -1,9 +1,8 @@
//===- X86InstrFMA3Info.h - X86 FMA3 Instruction Information ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 5912a3199613..2ec6d50f9702 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -1,9 +1,8 @@
//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,18 +16,13 @@
// FPStack specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>,
- SDTCisVT<1, f80>]>;
-def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, OtherVT>]>;
-def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, OtherVT>]>;
-def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
- SDTCisVT<2, OtherVT>]>;
+def SDTX86Fld : SDTypeProfile<1, 1, [SDTCisFP<0>,
+ SDTCisPtrTy<1>]>;
+def SDTX86Fst : SDTypeProfile<0, 2, [SDTCisFP<0>,
+ SDTCisPtrTy<1>]>;
+def SDTX86Fild : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
-def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
@@ -42,17 +36,71 @@ def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
[SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
SDNPMemOperand]>;
+def X86fist : SDNode<"X86ISD::FIST", SDTX86Fist,
+ [SDNPHasChain, SDNPInGlue, SDNPMayStore,
+ SDNPMemOperand]>;
def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
-def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
- [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
- [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
- [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
[SDNPHasChain, SDNPMayStore, SDNPSideEffect,
SDNPMemOperand]>;
+def X86fstf32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f32;
+}]>;
+def X86fstf64 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f64;
+}]>;
+def X86fstf80 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f80;
+}]>;
+
+def X86fldf32 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f32;
+}]>;
+def X86fldf64 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f64;
+}]>;
+def X86fldf80 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f80;
+}]>;
+
+def X86fild16 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def X86fild32 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def X86fild64 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def X86fildflag64 : PatFrag<(ops node:$ptr), (X86fildflag node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def X86fist64 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fist node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def X86fp_to_i16mem : PatFrag<(ops node:$val, node:$ptr),
+ (X86fp_to_mem node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def X86fp_to_i32mem : PatFrag<(ops node:$val, node:$ptr),
+ (X86fp_to_mem node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def X86fp_to_i64mem : PatFrag<(ops node:$val, node:$ptr),
+ (X86fp_to_mem node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
//===----------------------------------------------------------------------===//
// FPStack pattern fragments
//===----------------------------------------------------------------------===//
@@ -74,7 +122,9 @@ def fpimmneg1 : FPImmLeaf<fAny, [{
}]>;
// Some 'special' instructions - expanded after instruction selection.
-let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+// Clobbers EFLAGS due to OR instruction used internally.
+// FIXME: Can we model this in SelectionDAG?
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in {
def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src),
[(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src),
@@ -139,7 +189,6 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
// These instructions cannot address 80-bit memory.
multiclass FPBinary<SDNode OpNode, Format fp, string asmstring,
bit Forward = 1> {
-let mayLoad = 1, hasSideEffects = 1 in {
// ST(0) = ST(0) + [mem]
def _Fp32m : FpIf32<(outs RFP32:$dst),
(ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
@@ -176,8 +225,10 @@ def _Fp80m64: FpI_<(outs RFP80:$dst),
(OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))),
(set RFP80:$dst,
(OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>;
+let mayLoad = 1 in
def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src),
!strconcat("f", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src),
!strconcat("f", asmstring, "{l}\t$src")>;
// ST(0) = ST(0) + [memint]
@@ -185,52 +236,53 @@ def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP32:$dst,
- (OpNode RFP32:$src1, (X86fild addr:$src2, i16))),
+ (OpNode RFP32:$src1, (X86fild16 addr:$src2))),
(set RFP32:$dst,
- (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>;
+ (OpNode (X86fild16 addr:$src2), RFP32:$src1)))]>;
def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP32:$dst,
- (OpNode RFP32:$src1, (X86fild addr:$src2, i32))),
+ (OpNode RFP32:$src1, (X86fild32 addr:$src2))),
(set RFP32:$dst,
- (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>;
+ (OpNode (X86fild32 addr:$src2), RFP32:$src1)))]>;
def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP64:$dst,
- (OpNode RFP64:$src1, (X86fild addr:$src2, i16))),
+ (OpNode RFP64:$src1, (X86fild16 addr:$src2))),
(set RFP64:$dst,
- (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>;
+ (OpNode (X86fild16 addr:$src2), RFP64:$src1)))]>;
def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP64:$dst,
- (OpNode RFP64:$src1, (X86fild addr:$src2, i32))),
+ (OpNode RFP64:$src1, (X86fild32 addr:$src2))),
(set RFP64:$dst,
- (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>;
+ (OpNode (X86fild32 addr:$src2), RFP64:$src1)))]>;
def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP80:$dst,
- (OpNode RFP80:$src1, (X86fild addr:$src2, i16))),
+ (OpNode RFP80:$src1, (X86fild16 addr:$src2))),
(set RFP80:$dst,
- (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>;
+ (OpNode (X86fild16 addr:$src2), RFP80:$src1)))]>;
def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP80:$dst,
- (OpNode RFP80:$src1, (X86fild addr:$src2, i32))),
+ (OpNode RFP80:$src1, (X86fild32 addr:$src2))),
(set RFP80:$dst,
- (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>;
+ (OpNode (X86fild32 addr:$src2), RFP80:$src1)))]>;
+let mayLoad = 1 in
def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src),
!strconcat("fi", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
!strconcat("fi", asmstring, "{l}\t$src")>;
-} // mayLoad = 1, hasSideEffects = 1
}
-let Defs = [FPSW] in {
+let Defs = [FPSW], Uses = [FPCW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
@@ -258,42 +310,42 @@ defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>;
} // Defs = [FPSW]
class FPST0rInst<Format fp, string asm>
- : FPI<0xD8, fp, (outs), (ins RST:$op), asm>;
+ : FPI<0xD8, fp, (outs), (ins RSTi:$op), asm>;
class FPrST0Inst<Format fp, string asm>
- : FPI<0xDC, fp, (outs), (ins RST:$op), asm>;
+ : FPI<0xDC, fp, (outs), (ins RSTi:$op), asm>;
class FPrST0PInst<Format fp, string asm>
- : FPI<0xDE, fp, (outs), (ins RST:$op), asm>;
+ : FPI<0xDE, fp, (outs), (ins RSTi:$op), asm>;
// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
// we have to put some 'r's in and take them out of weird places.
-let SchedRW = [WriteFAdd] in {
-def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t$op">;
-def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st(0), $op|$op, st(0)}">;
-def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t$op">;
-def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t$op">;
-def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
-def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">;
-def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">;
-def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
-def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">;
+let SchedRW = [WriteFAdd], Defs = [FPSW], Uses = [FPCW] in {
+def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t{$op, %st|st, $op}">;
+def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st, $op|$op, st}">;
+def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t{%st, $op|$op, st}">;
+def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t{$op, %st|st, $op}">;
+def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st, $op|$op, st}">;
+def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t{%st, $op|$op, st}">;
+def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t{$op, %st|st, $op}">;
+def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st, $op|$op, st}">;
+def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t{%st, $op|$op, st}">;
} // SchedRW
-let SchedRW = [WriteFCom] in {
+let SchedRW = [WriteFCom], Defs = [FPSW], Uses = [FPCW] in {
def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
} // SchedRW
-let SchedRW = [WriteFMul] in {
-def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t$op">;
-def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st(0), $op|$op, st(0)}">;
-def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t$op">;
+let SchedRW = [WriteFMul], Defs = [FPSW], Uses = [FPCW] in {
+def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t{$op, %st|st, $op}">;
+def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st, $op|$op, st}">;
+def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t{%st, $op|$op, st}">;
} // SchedRW
-let SchedRW = [WriteFDiv] in {
-def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t$op">;
-def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
-def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t$op">;
-def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t$op">;
-def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
-def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">;
+let SchedRW = [WriteFDiv], Defs = [FPSW], Uses = [FPCW] in {
+def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t{$op, %st|st, $op}">;
+def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st, $op|$op, st}">;
+def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t{%st, $op|$op, st}">;
+def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t{$op, %st|st, $op}">;
+def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st, $op|$op, st}">;
+def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t{%st, $op|$op, st}">;
} // SchedRW
// Unary operations.
@@ -307,7 +359,7 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
}
-let Defs = [FPSW] in {
+let Defs = [FPSW], Uses = [FPCW] in {
let SchedRW = [WriteFSign] in {
defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
@@ -335,7 +387,7 @@ def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
-let SchedRW = [WriteFComLd] in {
+let SchedRW = [WriteFComLd], Defs = [FPSW], Uses = [FPCW] in {
def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
@@ -398,32 +450,31 @@ defm CMOVNP : FPCMov<X86_COND_NP>;
let Predicates = [HasCMov] in {
// These are not factored because there's no clean way to pass DA/DB.
-def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op),
- "fcmovb\t{$op, %st(0)|st(0), $op}">;
-def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op),
- "fcmovbe\t{$op, %st(0)|st(0), $op}">;
-def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op),
- "fcmove\t{$op, %st(0)|st(0), $op}">;
-def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op),
- "fcmovu\t{$op, %st(0)|st(0), $op}">;
-def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op),
- "fcmovnb\t{$op, %st(0)|st(0), $op}">;
-def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op),
- "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
-def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op),
- "fcmovne\t{$op, %st(0)|st(0), $op}">;
-def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op),
- "fcmovnu\t{$op, %st(0)|st(0), $op}">;
+def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RSTi:$op),
+ "fcmovb\t{$op, %st|st, $op}">;
+def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RSTi:$op),
+ "fcmovbe\t{$op, %st|st, $op}">;
+def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RSTi:$op),
+ "fcmove\t{$op, %st|st, $op}">;
+def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RSTi:$op),
+ "fcmovu\t{$op, %st|st, $op}">;
+def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RSTi:$op),
+ "fcmovnb\t{$op, %st|st, $op}">;
+def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RSTi:$op),
+ "fcmovnbe\t{$op, %st|st, $op}">;
+def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RSTi:$op),
+ "fcmovne\t{$op, %st|st, $op}">;
+def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RSTi:$op),
+ "fcmovnu\t{$op, %st|st, $op}">;
} // Predicates = [HasCMov]
} // SchedRW
// Floating point loads & stores.
-let SchedRW = [WriteLoad] in {
+let SchedRW = [WriteLoad], Uses = [FPCW] in {
let canFoldAsLoad = 1 in {
def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
[(set RFP32:$dst, (loadf32 addr:$src))]>;
-let isReMaterializable = 1 in
- def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
[(set RFP64:$dst, (loadf64 addr:$src))]>;
def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
[(set RFP80:$dst, (loadf80 addr:$src))]>;
@@ -435,26 +486,26 @@ def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
[(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
- [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+ [(set RFP32:$dst, (X86fild16 addr:$src))]>;
def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
- [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+ [(set RFP32:$dst, (X86fild32 addr:$src))]>;
def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
- [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+ [(set RFP32:$dst, (X86fild64 addr:$src))]>;
def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
- [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+ [(set RFP64:$dst, (X86fild16 addr:$src))]>;
def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
- [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+ [(set RFP64:$dst, (X86fild32 addr:$src))]>;
def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
- [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+ [(set RFP64:$dst, (X86fild64 addr:$src))]>;
def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
- [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+ [(set RFP80:$dst, (X86fild16 addr:$src))]>;
def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
- [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+ [(set RFP80:$dst, (X86fild32 addr:$src))]>;
def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
- [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+ [(set RFP80:$dst, (X86fild64 addr:$src))]>;
} // SchedRW
-let SchedRW = [WriteStore] in {
+let SchedRW = [WriteStore], Uses = [FPCW] in {
def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
[(store RFP32:$src, addr:$op)]>;
def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
@@ -489,9 +540,9 @@ def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
} // mayStore
-} // SchedRW
+} // SchedRW, Uses = [FPCW]
-let mayLoad = 1, SchedRW = [WriteLoad] in {
+let mayLoad = 1, SchedRW = [WriteLoad], Uses = [FPCW] in {
def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
@@ -499,7 +550,7 @@ def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
}
-let mayStore = 1, SchedRW = [WriteStore] in {
+let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in {
def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
@@ -513,7 +564,7 @@ def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
}
// FISTTP requires SSE3 even though it's a FPStack op.
-let Predicates = [HasSSE3], SchedRW = [WriteStore] in {
+let Predicates = [HasSSE3], SchedRW = [WriteStore], Uses = [FPCW] in {
def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
[(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
@@ -534,22 +585,22 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
[(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
} // Predicates = [HasSSE3]
-let mayStore = 1, SchedRW = [WriteStore] in {
+let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in {
def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst">;
}
// FP Stack manipulation instructions.
-let SchedRW = [WriteMove] in {
-def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op">;
-def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op">;
-def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op">;
-def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op">;
+let SchedRW = [WriteMove], Uses = [FPCW] in {
+def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RSTi:$op), "fld\t$op">;
+def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RSTi:$op), "fst\t$op">;
+def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RSTi:$op), "fstp\t$op">;
+def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RSTi:$op), "fxch\t$op">;
}
// Floating point constant loads.
-let isReMaterializable = 1, SchedRW = [WriteZero] in {
+let SchedRW = [WriteZero], Uses = [FPCW] in {
def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
[(set RFP32:$dst, fpimm0)]>;
def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
@@ -564,13 +615,13 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
[(set RFP80:$dst, fpimm1)]>;
}
-let SchedRW = [WriteFLD0] in
+let SchedRW = [WriteFLD0], Uses = [FPCW] in
def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">;
-let SchedRW = [WriteFLD1] in
+let SchedRW = [WriteFLD1], Uses = [FPCW] in
def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">;
-let SchedRW = [WriteFLDC], Defs = [FPSW] in {
+let SchedRW = [WriteFLDC], Uses = [FPCW] in {
def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>;
def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>;
def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>;
@@ -579,7 +630,7 @@ def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>;
} // SchedRW
// Floating point compares.
-let SchedRW = [WriteFCom] in {
+let SchedRW = [WriteFCom], Uses = [FPCW] in {
def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
@@ -591,37 +642,37 @@ def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
let SchedRW = [WriteFCom] in {
// CC = ST(0) cmp ST(i)
-let Defs = [EFLAGS, FPSW] in {
-let Predicates = [FPStackf32, HasCMov] in
-def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
- [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
-let Predicates = [FPStackf64, HasCMov] in
-def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
- [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
-let Predicates = [HasCMov] in
+let Defs = [EFLAGS, FPSW], Uses = [FPCW] in {
+def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>,
+ Requires<[FPStackf32, HasCMov]>;
+def UCOM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>,
+ Requires<[FPStackf64, HasCMov]>;
def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
- [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+ [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>,
+ Requires<[HasCMov]>;
}
-let Defs = [FPSW], Uses = [ST0] in {
+let Defs = [FPSW], Uses = [ST0, FPCW] in {
def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i)
- (outs), (ins RST:$reg), "fucom\t$reg">;
+ (outs), (ins RSTi:$reg), "fucom\t$reg">;
def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop
- (outs), (ins RST:$reg), "fucomp\t$reg">;
+ (outs), (ins RSTi:$reg), "fucomp\t$reg">;
def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop
(outs), (ins), "fucompp">;
}
-let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
+let Defs = [EFLAGS, FPSW], Uses = [ST0, FPCW] in {
def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i)
- (outs), (ins RST:$reg), "fucomi\t$reg">;
+ (outs), (ins RSTi:$reg), "fucomi\t{$reg, %st|st, $reg}">;
def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop
- (outs), (ins RST:$reg), "fucompi\t$reg">;
-}
+ (outs), (ins RSTi:$reg), "fucompi\t{$reg, %st|st, $reg}">;
-let Defs = [EFLAGS, FPSW] in {
-def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg), "fcomi\t$reg">;
-def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg), "fcompi\t$reg">;
+def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RSTi:$reg),
+ "fcomi\t{$reg, %st|st, $reg}">;
+def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RSTi:$reg),
+ "fcompi\t{$reg, %st|st, $reg}">;
}
} // SchedRW
@@ -631,12 +682,12 @@ let Defs = [AX], Uses = [FPSW] in
def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
(outs), (ins), "fnstsw\t{%ax|ax}",
[(set AX, (X86fp_stsw FPSW))]>;
-let Defs = [FPSW] in
+let Defs = [FPSW], Uses = [FPCW] in
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
[(X86fp_cwd_get16 addr:$dst)]>;
} // SchedRW
-let Defs = [FPSW], mayLoad = 1 in
+let Defs = [FPSW,FPCW], mayLoad = 1 in
def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
(outs), (ins i16mem:$dst), "fldcw\t$dst", []>,
Sched<[WriteLoad]>;
@@ -645,8 +696,8 @@ def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16]
let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW] in {
def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>;
-def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), "ffree\t$reg">;
-def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg), "ffreep\t$reg">;
+def FFREE : FPI<0xDD, MRM0r, (outs), (ins RSTi:$reg), "ffree\t$reg">;
+def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RSTi:$reg), "ffreep\t$reg">;
// Clear exceptions
def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", []>;
@@ -695,21 +746,17 @@ def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
//===----------------------------------------------------------------------===//
// Required for RET of f32 / f64 / f80 values.
-def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
-def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
-def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+def : Pat<(X86fldf32 addr:$src), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fldf64 addr:$src), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fldf80 addr:$src), (LD_Fp80m addr:$src)>;
// Required for CALL which return f32 / f64 / f80 values.
-def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
-def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op,
- RFP64:$src)>;
-def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
-def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op,
- RFP80:$src)>;
-def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op,
- RFP80:$src)>;
-def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
- RFP80:$src)>;
+def : Pat<(X86fstf32 RFP32:$src, addr:$op), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fstf32 RFP64:$src, addr:$op), (ST_Fp64m32 addr:$op, RFP64:$src)>;
+def : Pat<(X86fstf64 RFP64:$src, addr:$op), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fstf32 RFP80:$src, addr:$op), (ST_Fp80m32 addr:$op, RFP80:$src)>;
+def : Pat<(X86fstf64 RFP80:$src, addr:$op), (ST_Fp80m64 addr:$op, RFP80:$src)>;
+def : Pat<(X86fstf80 RFP80:$src, addr:$op), (ST_FpP80m addr:$op, RFP80:$src)>;
// Floating point constant -0.0 and -1.0
def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
@@ -720,7 +767,11 @@ def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
// Used to conv. i64 to f64 since there isn't a SSE version.
-def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m64 addr:$src)>;
+
+// Used to conv. between f80 and i64 for i64 atomic loads.
+def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m80 addr:$src)>;
+def : Pat<(X86fist64 RFP80:$src, addr:$op), (IST_Fp64m80 addr:$op, RFP80:$src)>;
// FP extensions map onto simple pseudo-value conversions if they are to/from
// the FP stack.
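
The Uses = [FPCW] markings added throughout the hunks above model the x87 control word as an implicit input: its rounding and precision control fields can affect the results of these loads, stores, compares and conversions, and FLDCW (which now also defines FPCW) writes it, so the scheduler sees a real ordering dependency instead of relying on special cases. As a rough standalone illustration of what FPCW models (plain C++ with names of my own choosing, not LLVM code), the rounding-control field sits in bits 10-11 of the 16-bit control word:

#include <cstdint>
#include <cstdio>

// Decode the rounding-control (RC) field, bits 10-11 of the x87 control word.
enum class X87Rounding { NearestEven, Down, Up, TowardZero };

static X87Rounding roundingControl(std::uint16_t ControlWord) {
  switch ((ControlWord >> 10) & 0x3) {
  case 0:  return X87Rounding::NearestEven;
  case 1:  return X87Rounding::Down;      // toward -inf
  case 2:  return X87Rounding::Up;        // toward +inf
  default: return X87Rounding::TowardZero;
  }
}

int main() {
  // 0x037F is the FNINIT/power-on default control word: round to nearest even.
  std::printf("RC field = %d\n", static_cast<int>(roundingControl(0x037F)));
  return 0;
}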
diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp
index 7d31cfab4137..d42fec3770c7 100644
--- a/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/lib/Target/X86/X86InstrFoldTables.cpp
@@ -1,9 +1,8 @@
//===-- X86InstrFoldTables.cpp - X86 Instruction Folding Tables -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -34,6 +33,17 @@ using namespace llvm;
// tables that would be incorrect. The manual review process allows us a chance
// to catch these before they become observable bugs.
static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
+ { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
+ { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
+ { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
+ { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
+ { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
+ { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
+ { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
+ { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
+ { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
+ { X86::ADD8ri_DB, X86::ADD8mi, TB_NO_REVERSE },
+ { X86::ADD8rr_DB, X86::ADD8mr, TB_NO_REVERSE },
{ X86::ADC16ri, X86::ADC16mi, 0 },
{ X86::ADC16ri8, X86::ADC16mi8, 0 },
{ X86::ADC16rr, X86::ADC16mr, 0 },
@@ -48,22 +58,13 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
{ X86::ADC8rr, X86::ADC8mr, 0 },
{ X86::ADD16ri, X86::ADD16mi, 0 },
{ X86::ADD16ri8, X86::ADD16mi8, 0 },
- { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
- { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
{ X86::ADD16rr, X86::ADD16mr, 0 },
- { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
{ X86::ADD32ri, X86::ADD32mi, 0 },
{ X86::ADD32ri8, X86::ADD32mi8, 0 },
- { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
- { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
{ X86::ADD32rr, X86::ADD32mr, 0 },
- { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
{ X86::ADD64ri32, X86::ADD64mi32, 0 },
- { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
{ X86::ADD64ri8, X86::ADD64mi8, 0 },
- { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
{ X86::ADD64rr, X86::ADD64mr, 0 },
- { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
{ X86::ADD8ri, X86::ADD8mi, 0 },
{ X86::ADD8ri8, X86::ADD8mi8, 0 },
{ X86::ADD8rr, X86::ADD8mr, 0 },
@@ -247,7 +248,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
{ X86::XOR64rr, X86::XOR64mr, 0 },
{ X86::XOR8ri, X86::XOR8mi, 0 },
{ X86::XOR8ri8, X86::XOR8mi8, 0 },
- { X86::XOR8rr, X86::XOR8mr, 0 }
+ { X86::XOR8rr, X86::XOR8mr, 0 },
};
static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
@@ -305,9 +306,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
- { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE },
- { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
- { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
+ { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOVSDto64rr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOVSS2DIrr, X86::MOVSSmr, TB_FOLDED_STORE },
{ X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
{ X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
{ X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
@@ -321,22 +322,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
{ X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
{ X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
- { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
- { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
- { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
- { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
- { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
- { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
- { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
- { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
- { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
- { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
- { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
- { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
- { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
- { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
- { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
- { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
+ { X86::SETCCr, X86::SETCCm, TB_FOLDED_STORE },
{ X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
{ X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
{ X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
@@ -403,12 +389,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
{ X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
{ X86::VMOVPDI2DIrr, X86::VMOVPDI2DImr, TB_FOLDED_STORE },
- { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE },
- { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE },
- { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE },
- { X86::VMOVSDto64rr, X86::VMOVSDto64mr, TB_FOLDED_STORE },
- { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE },
- { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVSDto64Zrr, X86::VMOVSDZmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVSDto64rr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVSS2DIZrr, X86::VMOVSSZmr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIrr, X86::VMOVSSmr, TB_FOLDED_STORE },
{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
@@ -544,14 +530,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::MOV16rr, X86::MOV16rm, 0 },
{ X86::MOV32rr, X86::MOV32rm, 0 },
{ X86::MOV64rr, X86::MOV64rm, 0 },
- { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
- { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::MOV64toSDrr, X86::MOVSDrm_alt, TB_NO_REVERSE },
{ X86::MOV8rr, X86::MOV8rm, 0 },
{ X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
{ X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
- { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVSSrm_alt, 0 },
{ X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
{ X86::MOVDQUrr, X86::MOVDQUrm, 0 },
{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
@@ -628,7 +614,6 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::SQRTSSr, X86::SQRTSSm, 0 },
{ X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
{ X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
- // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
{ X86::TZCNT16rr, X86::TZCNT16rm, 0 },
{ X86::TZCNT32rr, X86::TZCNT32rm, 0 },
{ X86::TZCNT64rr, X86::TZCNT64rm, 0 },
@@ -663,7 +648,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VCOMISSrr_Int, X86::VCOMISSrm_Int, TB_NO_REVERSE },
{ X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
{ X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE },
- { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 },
+ { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 },
{ X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 },
{ X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
{ X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
@@ -671,6 +656,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rm, 0 },
{ X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrm, 0 },
{ X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
+ { X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rm, 0 },
+ { X86::VCVTNEPS2BF16Z256rr, X86::VCVTNEPS2BF16Z256rm, 0 },
+ { X86::VCVTNEPS2BF16Zrr, X86::VCVTNEPS2BF16Zrm, 0 },
{ X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
{ X86::VCVTPD2DQZ128rr, X86::VCVTPD2DQZ128rm, 0 },
{ X86::VCVTPD2DQZ256rr, X86::VCVTPD2DQZ256rm, 0 },
@@ -830,10 +818,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VGETMANTPSZ128rri, X86::VGETMANTPSZ128rmi, 0 },
{ X86::VGETMANTPSZ256rri, X86::VGETMANTPSZ256rmi, 0 },
{ X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 },
- { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
- { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
- { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
- { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
+ { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::VMOV64toSDZrr, X86::VMOVSDZrm_alt, TB_NO_REVERSE },
+ { X86::VMOV64toSDrr, X86::VMOVSDrm_alt, TB_NO_REVERSE },
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
@@ -851,8 +839,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
{ X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
{ X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
- { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
- { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
+ { X86::VMOVDI2SSZrr, X86::VMOVSSZrm_alt, 0 },
+ { X86::VMOVDI2SSrr, X86::VMOVSSrm_alt, 0 },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
@@ -1206,6 +1194,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
};
static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
+ { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
+ { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
+ { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
+ { X86::ADD8rr_DB, X86::ADD8rm, TB_NO_REVERSE },
{ X86::ADC16rr, X86::ADC16rm, 0 },
{ X86::ADC32rr, X86::ADC32rm, 0 },
{ X86::ADC64rr, X86::ADC64rm, 0 },
@@ -1213,11 +1205,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::ADCX32rr, X86::ADCX32rm, 0 },
{ X86::ADCX64rr, X86::ADCX64rm, 0 },
{ X86::ADD16rr, X86::ADD16rm, 0 },
- { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
{ X86::ADD32rr, X86::ADD32rm, 0 },
- { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
{ X86::ADD64rr, X86::ADD64rm, 0 },
- { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
{ X86::ADD8rr, X86::ADD8rm, 0 },
{ X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
{ X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
@@ -1247,54 +1236,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
{ X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
{ X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
- { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
- { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
- { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
- { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
- { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
- { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
- { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
- { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
- { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
- { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
- { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
- { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
- { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
- { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
- { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
- { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
- { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
- { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
- { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
- { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
- { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
- { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
- { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
- { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
- { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
- { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
- { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
- { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
- { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
- { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
- { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
- { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
- { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
- { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
- { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
- { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
- { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
- { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
- { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
- { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
- { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
- { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
- { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
- { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
- { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
- { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
- { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
- { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
+ { X86::CMOV16rr, X86::CMOV16rm, 0 },
+ { X86::CMOV32rr, X86::CMOV32rm, 0 },
+ { X86::CMOV64rr, X86::CMOV64rm, 0 },
{ X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
{ X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
{ X86::CMPSDrr, X86::CMPSDrm, 0 },
@@ -1421,6 +1365,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, TB_NO_REVERSE },
{ X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
{ X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
+ { X86::MOVSDrr, X86::MOVLPDrm, TB_NO_REVERSE },
{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
@@ -1576,7 +1521,6 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
{ X86::SUBSSrr, X86::SUBSSrm, 0 },
{ X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
- // FIXME: TEST*rr -> swapped operand of TEST *mr.
{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
{ X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
{ X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
@@ -1697,6 +1641,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmkz, 0 },
{ X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmkz, 0 },
{ X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmkz, 0 },
+ { X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rm, 0 },
+ { X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rm, 0 },
+ { X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrm, 0 },
+ { X86::VCVTNEPS2BF16Z128rrkz, X86::VCVTNEPS2BF16Z128rmkz, 0 },
+ { X86::VCVTNEPS2BF16Z256rrkz, X86::VCVTNEPS2BF16Z256rmkz, 0 },
+ { X86::VCVTNEPS2BF16Zrrkz, X86::VCVTNEPS2BF16Zrmkz, 0 },
{ X86::VCVTPD2DQZ128rrkz, X86::VCVTPD2DQZ128rmkz, 0 },
{ X86::VCVTPD2DQZ256rrkz, X86::VCVTPD2DQZ256rmkz, 0 },
{ X86::VCVTPD2DQZrrkz, X86::VCVTPD2DQZrmkz, 0 },
@@ -2030,6 +1980,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMOVDQU8Zrrkz, X86::VMOVDQU8Zrmkz, TB_NO_REVERSE },
{ X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE },
{ X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
+ { X86::VMOVSDZrr, X86::VMOVLPDZ128rm, TB_NO_REVERSE },
+ { X86::VMOVSDrr, X86::VMOVLPDrm, TB_NO_REVERSE },
{ X86::VMOVSHDUPZ128rrkz, X86::VMOVSHDUPZ128rmkz, 0 },
{ X86::VMOVSHDUPZ256rrkz, X86::VMOVSHDUPZ256rmkz, 0 },
{ X86::VMOVSHDUPZrrkz, X86::VMOVSHDUPZrmkz, 0 },
@@ -2072,6 +2024,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
{ X86::VORPSZrr, X86::VORPSZrm, 0 },
{ X86::VORPSrr, X86::VORPSrm, 0 },
+ { X86::VP2INTERSECTDZ128rr, X86::VP2INTERSECTDZ128rm, 0 },
+ { X86::VP2INTERSECTDZ256rr, X86::VP2INTERSECTDZ256rm, 0 },
+ { X86::VP2INTERSECTDZrr, X86::VP2INTERSECTDZrm, 0 },
+ { X86::VP2INTERSECTQZ128rr, X86::VP2INTERSECTQZ128rm, 0 },
+ { X86::VP2INTERSECTQZ256rr, X86::VP2INTERSECTQZ256rm, 0 },
+ { X86::VP2INTERSECTQZrr, X86::VP2INTERSECTQZrm, 0 },
{ X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 },
{ X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 },
{ X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 },
@@ -3074,6 +3032,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmk, 0 },
{ X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmk, 0 },
{ X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmk, 0 },
+ { X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmkz, 0 },
+ { X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmkz, 0 },
+ { X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmkz, 0 },
+ { X86::VCVTNEPS2BF16Z128rrk, X86::VCVTNEPS2BF16Z128rmk, 0 },
+ { X86::VCVTNEPS2BF16Z256rrk, X86::VCVTNEPS2BF16Z256rmk, 0 },
+ { X86::VCVTNEPS2BF16Zrrk, X86::VCVTNEPS2BF16Zrmk, 0 },
{ X86::VCVTPD2DQZ128rrk, X86::VCVTPD2DQZ128rmk, 0 },
{ X86::VCVTPD2DQZ256rrk, X86::VCVTPD2DQZ256rmk, 0 },
{ X86::VCVTPD2DQZrrk, X86::VCVTPD2DQZrmk, 0 },
@@ -3162,6 +3126,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
{ X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
{ X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0 },
+ { X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0 },
+ { X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0 },
{ X86::VEXP2PDZrk, X86::VEXP2PDZmk, 0 },
{ X86::VEXP2PSZrk, X86::VEXP2PSZmk, 0 },
{ X86::VEXPANDPDZ128rrk, X86::VEXPANDPDZ128rmk, TB_NO_REVERSE },
@@ -4376,6 +4343,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
{ X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
{ X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
+ { X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0 },
+ { X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0 },
+ { X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0 },
{ X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE },
{ X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE },
{ X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0 },
@@ -4389,6 +4359,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
{ X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
{ X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0 },
+ { X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0 },
+ { X86::VDPBF16PSZ256rk, X86::VDPBF16PSZ256mk, 0 },
+ { X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0 },
+ { X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0 },
+ { X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0 },
{ X86::VFIXUPIMMPDZ128rrik, X86::VFIXUPIMMPDZ128rmik, 0 },
{ X86::VFIXUPIMMPDZ128rrikz, X86::VFIXUPIMMPDZ128rmikz, 0 },
{ X86::VFIXUPIMMPDZ256rrik, X86::VFIXUPIMMPDZ256rmik, 0 },
@@ -5315,9 +5291,7 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
}
#endif
- const X86MemoryFoldTableEntry *Data = std::lower_bound(Table.begin(),
- Table.end(),
- RegOp);
+ const X86MemoryFoldTableEntry *Data = llvm::lower_bound(Table, RegOp);
if (Data != Table.end() && Data->KeyOp == RegOp &&
!(Data->Flags & TB_NO_FORWARD))
return Data;
@@ -5404,7 +5378,7 @@ static ManagedStatic<X86MemUnfoldTable> MemUnfoldTable;
const X86MemoryFoldTableEntry *
llvm::lookupUnfoldTable(unsigned MemOp) {
auto &Table = MemUnfoldTable->Table;
- auto I = std::lower_bound(Table.begin(), Table.end(), MemOp);
+ auto I = llvm::lower_bound(Table, MemOp);
if (I != Table.end() && I->KeyOp == MemOp)
return &*I;
return nullptr;
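
Both lookup sites above now go through llvm::lower_bound(Table, Key), the range form of std::lower_bound; either way the scheme depends on the fold tables staying sorted by the register-form opcode so a binary search can find the memory-form opcode and its flags. A minimal standalone sketch of that lookup shape (simplified types of my own, not LLVM's X86MemoryFoldTableEntry API):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iterator>

// Sorted by KeyOp, mirroring how the tables above are ordered.
struct FoldEntry {
  std::uint16_t KeyOp;  // register-form opcode (sort key)
  std::uint16_t MemOp;  // memory-form opcode
  std::uint16_t Flags;  // folding flags, e.g. a no-unfold bit

  friend bool operator<(const FoldEntry &E, std::uint16_t Key) {
    return E.KeyOp < Key;
  }
};

static const FoldEntry *lookupFold(const FoldEntry *Begin, const FoldEntry *End,
                                   std::uint16_t RegOp) {
  const FoldEntry *I = std::lower_bound(Begin, End, RegOp);
  return (I != End && I->KeyOp == RegOp) ? I : nullptr;
}

int main() {
  static const FoldEntry Table[] = {{10, 110, 0}, {20, 120, 1}, {30, 130, 0}};
  if (const FoldEntry *E = lookupFold(std::begin(Table), std::end(Table), 20))
    std::printf("folded 20 -> %u (flags %u)\n", unsigned(E->MemOp),
                unsigned(E->Flags));
  return 0;
}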
diff --git a/lib/Target/X86/X86InstrFoldTables.h b/lib/Target/X86/X86InstrFoldTables.h
index 90016baead96..419baf98f61d 100644
--- a/lib/Target/X86/X86InstrFoldTables.h
+++ b/lib/Target/X86/X86InstrFoldTables.h
@@ -1,9 +1,8 @@
//===-- X86InstrFoldTables.h - X86 Instruction Folding Tables ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 47d4719d3060..e8f0d937dff4 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -1,9 +1,8 @@
//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -27,10 +26,13 @@ def RawFrmDst : Format<5>;
def RawFrmDstSrc : Format<6>;
def RawFrmImm8 : Format<7>;
def RawFrmImm16 : Format<8>;
+def AddCCFrm : Format<9>;
def MRMDestMem : Format<32>;
def MRMSrcMem : Format<33>;
def MRMSrcMem4VOp3 : Format<34>;
def MRMSrcMemOp4 : Format<35>;
+def MRMSrcMemCC : Format<36>;
+def MRMXmCC: Format<38>;
def MRMXm : Format<39>;
def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>;
def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>;
@@ -39,6 +41,8 @@ def MRMDestReg : Format<48>;
def MRMSrcReg : Format<49>;
def MRMSrcReg4VOp3 : Format<50>;
def MRMSrcRegOp4 : Format<51>;
+def MRMSrcRegCC : Format<52>;
+def MRMXrCC: Format<54>;
def MRMXr : Format<55>;
def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>;
def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>;
@@ -206,13 +210,10 @@ class TAPS : TA { Prefix OpPrefix = PS; }
class TAPD : TA { Prefix OpPrefix = PD; }
class TAXD : TA { Prefix OpPrefix = XD; }
class VEX { Encoding OpEnc = EncVEX; }
-class VEX_W { bits<2> VEX_WPrefix = 1; }
-class VEX_WIG { bits<2> VEX_WPrefix = 2; }
+class VEX_W { bit HasVEX_W = 1; }
+class VEX_WIG { bit IgnoresVEX_W = 1; }
// Special version of VEX_W that can be changed to VEX.W==0 for EVEX2VEX.
-// FIXME: We should consider adding separate bits for VEX_WIG and the extra
-// part of W1X. This would probably simplify the tablegen emitters and
-// the TSFlags creation below.
-class VEX_W1X { bits<2> VEX_WPrefix = 3; }
+class VEX_W1X { bit HasVEX_W = 1; bit EVEX_W1_VEX_W0 = 1; }
class VEX_4V : VEX { bit hasVEX_4V = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
@@ -296,7 +297,10 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasREPPrefix = 0; // Does this inst have a REP prefix?
Encoding OpEnc = EncNormal; // Encoding used by this instruction
bits<2> OpEncBits = OpEnc.Value;
- bits<2> VEX_WPrefix = 0; // Does this inst set the VEX_W field?
+ bit HasVEX_W = 0; // Does this inst set the VEX_W field?
+ bit IgnoresVEX_W = 0; // Does this inst ignore VEX_W field?
+ bit EVEX_W1_VEX_W0 = 0; // This EVEX inst with VEX.W==1 can become a VEX
+ // instruction with VEX.W == 0.
bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
@@ -311,11 +315,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction.
bit hasNoTrackPrefix = 0; // Does this inst have a 0x3E (NoTrack) prefix?
- bits<2> EVEX_LL;
- let EVEX_LL{0} = hasVEX_L;
- let EVEX_LL{1} = hasEVEX_L2;
// Vector size in bytes.
- bits<7> VectSize = !shl(16, EVEX_LL);
+ bits<7> VectSize = !if(hasEVEX_L2, 64, !if(hasVEX_L, 32, 16));
// The scaling factor for AVX512's compressed displacement is either
// - the size of a power-of-two number of elements or
@@ -355,7 +356,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{29-28} = OpEncBits;
let TSFlags{37-30} = Opcode;
// Currently no need for second bit in TSFlags - W Ignore is equivalent to 0.
- let TSFlags{38} = VEX_WPrefix{0};
+ let TSFlags{38} = HasVEX_W;
let TSFlags{39} = hasVEX_4V;
let TSFlags{40} = hasVEX_L;
let TSFlags{41} = hasEVEX_K;
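
The prefix refactor above splits the old two-bit VEX_WPrefix encoding into three independent flags; only HasVEX_W is packed into the instruction's TSFlags (bit 38), while IgnoresVEX_W and EVEX_W1_VEX_W0 are consumed by the TableGen emitters (as the comment notes, W-ignore is equivalent to 0 for TSFlags purposes). A hedged sketch of what that single-bit packing amounts to (illustrative helpers, not LLVM's real TSFlags accessors):

#include <cstdint>

// Only the bit position matches the table above; everything else is invented
// for illustration.
constexpr unsigned VEXWShift = 38;

constexpr std::uint64_t packVEXW(std::uint64_t TSFlags, bool HasVEXW) {
  return TSFlags | (std::uint64_t(HasVEXW) << VEXWShift);
}

constexpr bool hasVEXW(std::uint64_t TSFlags) {
  return (TSFlags >> VEXWShift) & 1;
}

static_assert(hasVEXW(packVEXW(0, true)), "bit 38 round-trips");
static_assert(!hasVEXW(packVEXW(0, false)), "clear stays clear");

int main() { return hasVEXW(packVEXW(0, true)) ? 0 : 1; }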
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 11a27ba90586..096cc27861ca 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -1,9 +1,8 @@
//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -100,8 +99,10 @@ def X86insertps : SDNode<"X86ISD::INSERTPS",
def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
-def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>,
@@ -127,21 +128,31 @@ def X86vfpext : SDNode<"X86ISD::VFPEXT",
def X86vfpround: SDNode<"X86ISD::VFPROUND",
SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, f64>,
- SDTCisSameSizeAs<0, 1>]>>;
+ SDTCisOpSmallerThanOp<0, 1>]>>;
-def X86froundRnd: SDNode<"X86ISD::VFPROUNDS_RND",
+def X86frounds : SDNode<"X86ISD::VFPROUNDS",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0, 1>,
+ SDTCVecEltisVT<2, f64>,
+ SDTCisSameSizeAs<0, 2>]>>;
+
+def X86froundsRnd: SDNode<"X86ISD::VFPROUNDS_RND",
SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
SDTCisSameAs<0, 1>,
SDTCVecEltisVT<2, f64>,
SDTCisSameSizeAs<0, 2>,
SDTCisVT<3, i32>]>>;
-def X86fpextRnd : SDNode<"X86ISD::VFPEXTS_RND",
- SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f64>,
+def X86fpexts : SDNode<"X86ISD::VFPEXTS",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
SDTCisSameAs<0, 1>,
SDTCVecEltisVT<2, f32>,
- SDTCisSameSizeAs<0, 2>,
- SDTCisVT<3, i32>]>>;
+ SDTCisSameSizeAs<0, 2>]>>;
+def X86fpextsSAE : SDNode<"X86ISD::VFPEXTS_SAE",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+ SDTCisSameAs<0, 1>,
+ SDTCVecEltisVT<2, f32>,
+ SDTCisSameSizeAs<0, 2>]>>;
def X86vmfpround: SDNode<"X86ISD::VMFPROUND",
SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
@@ -164,25 +175,14 @@ def X86CmpMaskCC :
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
SDTCisVec<1>, SDTCisSameAs<2, 1>,
SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
-def X86CmpMaskCCRound :
- SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>,
- SDTCisVec<1>, SDTCisFP<1>, SDTCisSameAs<2, 1>,
- SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>,
- SDTCisVT<4, i32>]>;
def X86CmpMaskCCScalar :
SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>]>;
-def X86CmpMaskCCScalarRound :
- SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
- SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
-
def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
-// Hack to make CMPM commutable in tablegen patterns for load folding.
-def X86cmpm_c : SDNode<"X86ISD::CMPM", X86CmpMaskCC, [SDNPCommutative]>;
-def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
+def X86cmpmSAE : SDNode<"X86ISD::CMPM_SAE", X86CmpMaskCC>;
def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
-def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;
+def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>;
def X86phminpos: SDNode<"X86ISD::PHMINPOS",
SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>;
@@ -198,6 +198,8 @@ def X86vsra : SDNode<"X86ISD::VSRA", X86vshiftuniform>;
def X86vshiftvariable : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<0>]>;
+def X86vshlv : SDNode<"X86ISD::VSHLV", X86vshiftvariable>;
+def X86vsrlv : SDNode<"X86ISD::VSRLV", X86vshiftvariable>;
def X86vsrav : SDNode<"X86ISD::VSRAV", X86vshiftvariable>;
def X86vshli : SDNode<"X86ISD::VSHLI", X86vshiftimm>;
@@ -299,25 +301,15 @@ def SDTFPBinOpImm: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisVT<3, i32>]>;
-def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>,
- SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>,
- SDTCisVT<3, i32>,
- SDTCisVT<4, i32>]>;
-def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>,
- SDTCisInt<3>,
- SDTCisSameSizeAs<0, 3>,
- SDTCisSameNumEltsAs<0, 3>,
- SDTCisVT<4, i32>,
- SDTCisVT<5, i32>]>;
-def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>,
+def SDTFPTernaryOpImm: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisInt<3>,
+ SDTCisSameSizeAs<0, 3>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, i32>]>;
+def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>,
SDTCisSameAs<0,1>,
SDTCisVT<2, i32>]>;
-def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
- SDTCisSameAs<0,1>,
- SDTCisVT<2, i32>,
- SDTCisVT<3, i32>]>;
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
@@ -373,11 +365,23 @@ def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
-def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>;
-def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>;
-
-def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>;
-def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>;
+def X86Movsd : SDNode<"X86ISD::MOVSD",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v2f64>,
+ SDTCisVT<1, v2f64>,
+ SDTCisVT<2, v2f64>]>>;
+def X86Movss : SDNode<"X86ISD::MOVSS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>,
+ SDTCisVT<1, v4f32>,
+ SDTCisVT<2, v4f32>]>>;
+
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>,
+ SDTCisVT<1, v4f32>,
+ SDTCisVT<2, v4f32>]>>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>,
+ SDTCisVT<1, v4f32>,
+ SDTCisVT<2, v4f32>]>>;
def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
SDTCisVec<1>, SDTCisInt<1>,
@@ -421,16 +425,18 @@ def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
-def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImmRound>;
-def X86VFixupimmScalar : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImmRound>;
+def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImm>;
+def X86VFixupimmSAE : SDNode<"X86ISD::VFIXUPIMM_SAE", SDTFPTernaryOpImm>;
+def X86VFixupimms : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImm>;
+def X86VFixupimmSAEs : SDNode<"X86ISD::VFIXUPIMMS_SAE", SDTFPTernaryOpImm>;
def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImm>;
-def X86VRangeRnd : SDNode<"X86ISD::VRANGE_RND", SDTFPBinOpImmRound>;
+def X86VRangeSAE : SDNode<"X86ISD::VRANGE_SAE", SDTFPBinOpImm>;
def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImm>;
-def X86VReduceRnd : SDNode<"X86ISD::VREDUCE_RND", SDTFPUnaryOpImmRound>;
+def X86VReduceSAE : SDNode<"X86ISD::VREDUCE_SAE", SDTFPUnaryOpImm>;
def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImm>;
-def X86VRndScaleRnd: SDNode<"X86ISD::VRNDSCALE_RND", SDTFPUnaryOpImmRound>;
+def X86VRndScaleSAE: SDNode<"X86ISD::VRNDSCALE_SAE", SDTFPUnaryOpImm>;
def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImm>;
-def X86VGetMantRnd : SDNode<"X86ISD::VGETMANT_RND", SDTFPUnaryOpImmRound>;
+def X86VGetMantSAE : SDNode<"X86ISD::VGETMANT_SAE", SDTFPUnaryOpImm>;
def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
SDTCisFP<1>,
@@ -448,27 +454,42 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
+def X86Blendv : SDNode<"X86ISD::BLENDV",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>,
+ SDTCisSameNumEltsAs<0, 1>,
+ SDTCisSameSizeAs<0, 1>]>>;
def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
+def X86fadds : SDNode<"X86ISD::FADDS", SDTFPBinOp>;
def X86faddRnds : SDNode<"X86ISD::FADDS_RND", SDTFPBinOpRound>;
def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
+def X86fsubs : SDNode<"X86ISD::FSUBS", SDTFPBinOp>;
def X86fsubRnds : SDNode<"X86ISD::FSUBS_RND", SDTFPBinOpRound>;
def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
+def X86fmuls : SDNode<"X86ISD::FMULS", SDTFPBinOp>;
def X86fmulRnds : SDNode<"X86ISD::FMULS_RND", SDTFPBinOpRound>;
def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
+def X86fdivs : SDNode<"X86ISD::FDIVS", SDTFPBinOp>;
def X86fdivRnds : SDNode<"X86ISD::FDIVS_RND", SDTFPBinOpRound>;
-def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
-def X86fmaxRnds : SDNode<"X86ISD::FMAXS_RND", SDTFPBinOpRound>;
-def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
-def X86fminRnds : SDNode<"X86ISD::FMINS_RND", SDTFPBinOpRound>;
-def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
-def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>;
+def X86fmaxSAE : SDNode<"X86ISD::FMAX_SAE", SDTFPBinOp>;
+def X86fmaxSAEs : SDNode<"X86ISD::FMAXS_SAE", SDTFPBinOp>;
+def X86fminSAE : SDNode<"X86ISD::FMIN_SAE", SDTFPBinOp>;
+def X86fminSAEs : SDNode<"X86ISD::FMINS_SAE", SDTFPBinOp>;
+def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOp>;
+def X86scalefRnd : SDNode<"X86ISD::SCALEF_RND", SDTFPBinOpRound>;
+def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOp>;
+def X86scalefsRnd: SDNode<"X86ISD::SCALEFS_RND", SDTFPBinOpRound>;
def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
+def X86fsqrts : SDNode<"X86ISD::FSQRTS", SDTFPBinOp>;
def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>;
-def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
-def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>;
+def X86fgetexp : SDNode<"X86ISD::FGETEXP", SDTFPUnaryOp>;
+def X86fgetexpSAE : SDNode<"X86ISD::FGETEXP_SAE", SDTFPUnaryOp>;
+def X86fgetexps : SDNode<"X86ISD::FGETEXPS", SDTFPBinOp>;
+def X86fgetexpSAEs : SDNode<"X86ISD::FGETEXPS_SAE", SDTFPBinOp>;
def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>;
@@ -484,6 +505,10 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutat
def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>;
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86vp2intersect : SDNode<"X86ISD::VP2INTERSECT",
+ SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+ SDTCisVec<1>, SDTCisSameAs<1, 2>]>>;
+
def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
@@ -500,27 +525,36 @@ def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>;
def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>;
def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>;
-def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
-def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
-def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>;
+def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOp>;
+def X86rsqrt28SAE: SDNode<"X86ISD::RSQRT28_SAE", SDTFPUnaryOp>;
+def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOp>;
+def X86rcp28SAE : SDNode<"X86ISD::RCP28_SAE", SDTFPUnaryOp>;
+def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOp>;
+def X86exp2SAE : SDNode<"X86ISD::EXP2_SAE", SDTFPUnaryOp>;
def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>;
def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>;
-def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>;
-def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>;
+def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOp>;
+def X86rsqrt28SAEs : SDNode<"X86ISD::RSQRT28S_SAE", SDTFPBinOp>;
+def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOp>;
+def X86rcp28SAEs : SDNode<"X86ISD::RCP28S_SAE", SDTFPBinOp>;
def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>;
def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>;
def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>;
def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImm>;
-def X86RangesRnd : SDNode<"X86ISD::VRANGES_RND", SDTFPBinOpImmRound>;
-def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>;
-def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>;
-def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>;
-
-def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
- [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
-def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
- [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
+def X86RangesSAE : SDNode<"X86ISD::VRANGES_SAE", SDTFPBinOpImm>;
+def X86RndScalesSAE : SDNode<"X86ISD::VRNDSCALES_SAE", SDTFPBinOpImm>;
+def X86ReducesSAE : SDNode<"X86ISD::VREDUCES_SAE", SDTFPBinOpImm>;
+def X86GetMantsSAE : SDNode<"X86ISD::VGETMANTS_SAE", SDTFPBinOpImm>;
+
+def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisVec<1>,
+ SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>]>, []>;
+def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisVec<1>,
+ SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>]>, []>;
// vpshufbitqmb
def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB",
@@ -529,6 +563,8 @@ def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB",
SDTCVecEltisVT<0,i1>,
SDTCisSameNumEltsAs<0,1>]>>;
+def SDTintToFP: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>]>;
def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
SDTCisSameAs<0,1>, SDTCisInt<2>,
SDTCisVT<3, i32>]>;
@@ -550,13 +586,15 @@ def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisVT<2, i32>]>;
// Scalar
+def X86SintToFp : SDNode<"X86ISD::SCALAR_SINT_TO_FP", SDTintToFP>;
def X86SintToFpRnd : SDNode<"X86ISD::SCALAR_SINT_TO_FP_RND", SDTintToFPRound>;
+def X86UintToFp : SDNode<"X86ISD::SCALAR_UINT_TO_FP", SDTintToFP>;
def X86UintToFpRnd : SDNode<"X86ISD::SCALAR_UINT_TO_FP_RND", SDTintToFPRound>;
def X86cvtts2Int : SDNode<"X86ISD::CVTTS2SI", SDTSFloatToInt>;
def X86cvtts2UInt : SDNode<"X86ISD::CVTTS2UI", SDTSFloatToInt>;
-def X86cvtts2IntRnd : SDNode<"X86ISD::CVTTS2SI_RND", SDTSFloatToIntRnd>;
-def X86cvtts2UIntRnd : SDNode<"X86ISD::CVTTS2UI_RND", SDTSFloatToIntRnd>;
+def X86cvtts2IntSAE : SDNode<"X86ISD::CVTTS2SI_SAE", SDTSFloatToInt>;
+def X86cvtts2UIntSAE : SDNode<"X86ISD::CVTTS2UI_SAE", SDTSFloatToInt>;
def X86cvts2si : SDNode<"X86ISD::CVTS2SI", SDTSFloatToInt>;
def X86cvts2usi : SDNode<"X86ISD::CVTS2UI", SDTSFloatToInt>;
@@ -566,8 +604,8 @@ def X86cvts2usiRnd : SDNode<"X86ISD::CVTS2UI_RND", SDTSFloatToIntRnd>;
// Vector with rounding mode
// cvtt fp-to-int stuff
-def X86cvttp2siRnd : SDNode<"X86ISD::CVTTP2SI_RND", SDTFloatToIntRnd>;
-def X86cvttp2uiRnd : SDNode<"X86ISD::CVTTP2UI_RND", SDTFloatToIntRnd>;
+def X86cvttp2siSAE : SDNode<"X86ISD::CVTTP2SI_SAE", SDTFloatToInt>;
+def X86cvttp2uiSAE : SDNode<"X86ISD::CVTTP2UI_SAE", SDTFloatToInt>;
def X86VSintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTVintToFPRound>;
def X86VUintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTVintToFPRound>;
@@ -590,6 +628,13 @@ def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>;
def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>;
+// Masked versions of above
+def SDTMVintToFP: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisInt<1>,
+ SDTCisSameSizeAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<1, 3>]>;
def SDTMFloatToInt: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisFP<1>,
SDTCisSameSizeAs<0, 1>,
@@ -597,6 +642,9 @@ def SDTMFloatToInt: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
SDTCVecEltisVT<3, i1>,
SDTCisSameNumEltsAs<1, 3>]>;
+def X86VMSintToFP : SDNode<"X86ISD::MCVTSI2P", SDTMVintToFP>;
+def X86VMUintToFP : SDNode<"X86ISD::MCVTUI2P", SDTMVintToFP>;
+
def X86mcvtp2Int : SDNode<"X86ISD::MCVTP2SI", SDTMFloatToInt>;
def X86mcvtp2UInt : SDNode<"X86ISD::MCVTP2UI", SDTMFloatToInt>;
def X86mcvttp2si : SDNode<"X86ISD::MCVTTP2SI", SDTMFloatToInt>;
@@ -607,10 +655,9 @@ def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS",
SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, i16>]> >;
-def X86cvtph2psRnd : SDNode<"X86ISD::CVTPH2PS_RND",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, i16>,
- SDTCisVT<2, i32>]> >;
+def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>]> >;
def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
@@ -623,17 +670,35 @@ def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH",
SDTCisSameAs<0, 3>,
SDTCVecEltisVT<4, i1>,
SDTCisSameNumEltsAs<1, 4>]> >;
-def X86vfpextRnd : SDNode<"X86ISD::VFPEXT_RND",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+def X86vfpextSAE : SDNode<"X86ISD::VFPEXT_SAE",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
SDTCVecEltisVT<1, f32>,
- SDTCisOpSmallerThanOp<1, 0>,
- SDTCisVT<2, i32>]>>;
+ SDTCisOpSmallerThanOp<1, 0>]>>;
def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, f64>,
SDTCisOpSmallerThanOp<0, 1>,
SDTCisVT<2, i32>]>>;
+// cvt fp to bfloat16
+def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<1,2>]>>;
+def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<1, 3>]>>;
+def X86cvtneps2bf16 : SDNode<"X86ISD::CVTNEPS2BF16",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>]>>;
+def X86dpbf16ps : SDNode<"X86ISD::DPBF16PS",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0,1>,
+ SDTCVecEltisVT<2, i32>,
+ SDTCisSameAs<2,3>]>>;
+
// Galois field arithmetic
def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>;
@@ -653,18 +718,8 @@ def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
SDNPWantRoot, SDNPWantParent]>;
-def ssmem : Operand<v4f32> {
- let PrintMethod = "printf32mem";
- let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
- let ParserMatchClass = X86Mem32AsmOperand;
- let OperandType = "OPERAND_MEMORY";
-}
-def sdmem : Operand<v2f64> {
- let PrintMethod = "printf64mem";
- let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
- let ParserMatchClass = X86Mem64AsmOperand;
- let OperandType = "OPERAND_MEMORY";
-}
+def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
//===----------------------------------------------------------------------===//
// SSE pattern fragments
@@ -695,9 +750,9 @@ def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>;
// 128-/256-/512-bit extload pattern fragments
-def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
-def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
-def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
+def extloadv2f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
// Like 'store', but always requires vector size alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
@@ -884,15 +939,20 @@ def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
-def vzmovl_v2i64 : PatFrag<(ops node:$src),
- (bitconvert (v2i64 (X86vzmovl
- (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
-def vzmovl_v4i32 : PatFrag<(ops node:$src),
- (bitconvert (v4i32 (X86vzmovl
- (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+def X86vzload32 : PatFrag<(ops node:$src),
+ (X86vzld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4;
+}]>;
-def vzload_v2i64 : PatFrag<(ops node:$src),
- (bitconvert (v2i64 (X86vzload node:$src)))>;
+def X86vzload64 : PatFrag<(ops node:$src),
+ (X86vzld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
+
+def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr),
+ (X86vextractst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
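
The rewritten fragments above drop the old bitcast-wrapped vzmovl/vzload patterns in favour of one generic memory node plus predicates keyed purely on the store size recorded in the node's memory operand. A toy version of that size dispatch (plain C++ stand-ins, not SelectionDAG types):

#include <cstdio>

// Stand-in for the only MemIntrinsicSDNode state the predicates above
// consult: the store size of the memory VT, in bytes.
struct MemNode { unsigned StoreSizeBytes; };

static bool matchesVZLoad32(const MemNode &N) { return N.StoreSizeBytes == 4; }
static bool matchesVZLoad64(const MemNode &N) { return N.StoreSizeBytes == 8; }

int main() {
  MemNode Load{8};
  std::printf("vzload64: %d, vzload32: %d\n",
              matchesVZLoad64(Load), matchesVZLoad32(Load));
  return 0;
}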
def fp32imm0 : PatLeaf<(f32 fpimm), [{
@@ -903,20 +963,6 @@ def fp64imm0 : PatLeaf<(f64 fpimm), [{
return N->isExactlyValue(+0.0);
}]>;
-def I8Imm : SDNodeXForm<imm, [{
- // Transformation function: get the low 8 bits.
- return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
-}]>;
-
-def FROUND_NO_EXC : PatLeaf<(i32 8)>;
-def FROUND_CURRENT : PatLeaf<(i32 4)>;
-
-// BYTE_imm - Transform bit immediates into byte immediates.
-def BYTE_imm : SDNodeXForm<imm, [{
- // Transformation function: imm >> 3
- return getI32Imm(N->getZExtValue() >> 3, SDLoc(N));
-}]>;
-
// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
// to VEXTRACTF128/VEXTRACTI128 imm.
def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
@@ -943,8 +989,10 @@ def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
(extract_subvector node:$bigvec,
- node:$index), [{}],
- EXTRACT_get_vextract128_imm>;
+ node:$index), [{
+ // Index 0 can be handled via extract_subreg.
+ return !isNullConstant(N->getOperand(1));
+}], EXTRACT_get_vextract128_imm>;
def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index),
@@ -954,8 +1002,10 @@ def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
(extract_subvector node:$bigvec,
- node:$index), [{}],
- EXTRACT_get_vextract256_imm>;
+ node:$index), [{
+ // Index 0 can be handled via extract_subreg.
+ return !isNullConstant(N->getOperand(1));
+}], EXTRACT_get_vextract256_imm>;
def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index),
@@ -963,70 +1013,46 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index), [{}],
INSERT_get_vinsert256_imm>;
-def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_load node:$src1, node:$src2, node:$src3), [{
+def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_ld node:$src1, node:$src2, node:$src3), [{
return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
}]>;
-def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mload node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedLoadSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mload node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedLoadSDNode>(N)->getAlignment() >= 32;
-}]>;
-
-def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mload node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedLoadSDNode>(N)->getAlignment() >= 64;
-}]>;
-
-def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_load node:$src1, node:$src2, node:$src3), [{
- return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
- cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+ // Use the node type to determine the size the alignment needs to match.
+ // We can't use memory VT because type widening changes the node VT, but
+ // not the memory VT.
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ return Ld->getAlignment() >= Ld->getValueType(0).getStoreSize();
}]>;
def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_load node:$src1, node:$src2, node:$src3), [{
+ (masked_ld node:$src1, node:$src2, node:$src3), [{
return cast<MaskedLoadSDNode>(N)->isExpandingLoad();
}]>;
// Masked store fragments.
// X86mstore can't be implemented in core DAG files because some targets
// do not support vector types (llvm-tblgen will fail).
-def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_store node:$src1, node:$src2, node:$src3), [{
+def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_st node:$src1, node:$src2, node:$src3), [{
return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
(!cast<MaskedStoreSDNode>(N)->isCompressingStore());
}]>;
-def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mstore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mstore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32;
-}]>;
-
-def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mstore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64;
-}]>;
-
-def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_store node:$src1, node:$src2, node:$src3), [{
- return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
- (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+ // Use the node type to determine the size the alignment needs to match.
+ // We can't use memory VT because type widening changes the node VT, but
+ // not the memory VT.
+ auto *St = cast<MaskedStoreSDNode>(N);
+ return St->getAlignment() >= St->getOperand(1).getValueType().getStoreSize();
}]>;
def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_store node:$src1, node:$src2, node:$src3), [{
+ (masked_st node:$src1, node:$src2, node:$src3), [{
return cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
@@ -1034,7 +1060,7 @@ def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
// X86mtruncstore can't be implemented in core DAG files because some targets
// do not support vector types (llvm-tblgen will fail).
def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_store node:$src1, node:$src2, node:$src3), [{
+ (masked_st node:$src1, node:$src2, node:$src3), [{
return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
}]>;
def masked_truncstorevi8 :
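
The aligned masked load/store fragments above now share a single predicate: the access counts as aligned when its alignment covers the store size of the node's value type rather than the memory VT, because legalization may widen the node VT while the memory VT stays narrow. A minimal standalone sketch of that check (the helper names are illustrative only, not part of this patch), assuming the usual SelectionDAG node interfaces:

    // Sketch only: mirrors the PatFrag predicate bodies above.
    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    static bool isAlignedMaskedLoad(const MaskedLoadSDNode *Ld) {
      // The node VT reflects any type widening, so its store size is the
      // number of bytes the selected full-width instruction will touch.
      return Ld->getAlignment() >= Ld->getValueType(0).getStoreSize();
    }

    static bool isAlignedMaskedStore(const MaskedStoreSDNode *St) {
      // For stores, operand 1 is the value being stored.
      return St->getAlignment() >=
             St->getOperand(1).getValueType().getStoreSize();
    }
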
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index ab14ee7fadf2..dbe45356c42b 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -220,16 +219,22 @@ static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
return true;
case X86::MOV32rm:
case X86::MOVSSrm:
- case X86::VMOVSSZrm:
+ case X86::MOVSSrm_alt:
case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
+ case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
case X86::KMOVDkm:
MemBytes = 4;
return true;
case X86::MOV64rm:
case X86::LD_Fp64m:
case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::KMOVQkm:
@@ -483,9 +488,10 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::MOV16rm:
case X86::MOV32rm:
case X86::MOV64rm:
- case X86::LD_Fp64m:
case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
@@ -493,7 +499,9 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::MOVDQArm:
case X86::MOVDQUrm:
case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
@@ -510,7 +518,9 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::MMX_MOVQ64rm:
// AVX-512
case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
case X86::VMOVAPDZrm:
@@ -590,96 +600,12 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
return true;
}
-bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
- MachineBasicBlock::iterator E = MBB.end();
-
- // For compile time consideration, if we are not able to determine the
- // safety after visiting 4 instructions in each direction, we will assume
- // it's not safe.
- MachineBasicBlock::iterator Iter = I;
- for (unsigned i = 0; Iter != E && i < 4; ++i) {
- bool SeenDef = false;
- for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
- MachineOperand &MO = Iter->getOperand(j);
- if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
- SeenDef = true;
- if (!MO.isReg())
- continue;
- if (MO.getReg() == X86::EFLAGS) {
- if (MO.isUse())
- return false;
- SeenDef = true;
- }
- }
-
- if (SeenDef)
- // This instruction defines EFLAGS, no need to look any further.
- return true;
- ++Iter;
- // Skip over debug instructions.
- while (Iter != E && Iter->isDebugInstr())
- ++Iter;
- }
-
- // It is safe to clobber EFLAGS at the end of a block if no successor has it
- // live in.
- if (Iter == E) {
- for (MachineBasicBlock *S : MBB.successors())
- if (S->isLiveIn(X86::EFLAGS))
- return false;
- return true;
- }
-
- MachineBasicBlock::iterator B = MBB.begin();
- Iter = I;
- for (unsigned i = 0; i < 4; ++i) {
- // If we make it to the beginning of the block, it's safe to clobber
- // EFLAGS iff EFLAGS is not live-in.
- if (Iter == B)
- return !MBB.isLiveIn(X86::EFLAGS);
-
- --Iter;
- // Skip over debug instructions.
- while (Iter != B && Iter->isDebugInstr())
- --Iter;
-
- bool SawKill = false;
- for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
- MachineOperand &MO = Iter->getOperand(j);
- // A register mask may clobber EFLAGS, but we should still look for a
- // live EFLAGS def.
- if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
- SawKill = true;
- if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
- if (MO.isDef()) return MO.isDead();
- if (MO.isKill()) SawKill = true;
- }
- }
-
- if (SawKill)
- // This instruction kills EFLAGS and doesn't redefine it, so
- // there's no need to look further.
- return true;
- }
-
- // Conservative answer.
- return false;
-}
-
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
- bool ClobbersEFLAGS = false;
- for (const MachineOperand &MO : Orig.operands()) {
- if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
- ClobbersEFLAGS = true;
- break;
- }
- }
-
+ bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
// The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
// effects.
@@ -796,11 +722,10 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
- LiveVariables *LV) const {
+ LiveVariables *LV, bool Is8BitOp) const {
// We handle 8-bit adds and various 16-bit opcodes in the switch below.
- bool Is16BitOp = !(MIOpc == X86::ADD8rr || MIOpc == X86::ADD8ri);
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
- assert((!Is16BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
+ assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
*RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
"Unexpected type for LEA transform");
@@ -830,7 +755,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
unsigned Src = MI.getOperand(1).getReg();
bool IsDead = MI.getOperand(0).isDead();
bool IsKill = MI.getOperand(1).isKill();
- unsigned SubReg = Is16BitOp ? X86::sub_16bit : X86::sub_8bit;
+ unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
MachineInstr *InsMI =
@@ -842,19 +767,23 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
switch (MIOpc) {
default: llvm_unreachable("Unreachable!");
+ case X86::SHL8ri:
case X86::SHL16ri: {
unsigned ShAmt = MI.getOperand(2).getImm();
MIB.addReg(0).addImm(1ULL << ShAmt)
.addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
break;
}
+ case X86::INC8r:
case X86::INC16r:
addRegOffset(MIB, InRegLEA, true, 1);
break;
+ case X86::DEC8r:
case X86::DEC16r:
addRegOffset(MIB, InRegLEA, true, -1);
break;
case X86::ADD8ri:
+ case X86::ADD8ri_DB:
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri_DB:
@@ -862,6 +791,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
break;
case X86::ADD8rr:
+ case X86::ADD8rr_DB:
case X86::ADD16rr:
case X86::ADD16rr_DB: {
unsigned Src2 = MI.getOperand(2).getReg();
@@ -948,9 +878,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstr *NewMI = nullptr;
bool Is64Bit = Subtarget.is64Bit();
+ bool Is8BitOp = false;
unsigned MIOpc = MI.getOpcode();
switch (MIOpc) {
- default: return nullptr;
+ default: llvm_unreachable("Unreachable!");
case X86::SHL64ri: {
assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
@@ -1000,12 +931,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
+ case X86::SHL8ri:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
case X86::SHL16ri: {
assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
if (!isTruncatedShiftCountForLEA(ShAmt))
return nullptr;
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
}
case X86::INC64r:
case X86::INC32r: {
@@ -1029,8 +963,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
NewMI = addOffset(MIB, 1);
break;
}
- case X86::INC16r:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
case X86::DEC64r:
case X86::DEC32r: {
assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
@@ -1054,8 +986,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
+ case X86::DEC8r:
+ case X86::INC8r:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
case X86::DEC16r:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
+ case X86::INC16r:
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
case X86::ADD64rr:
case X86::ADD64rr_DB:
case X86::ADD32rr:
@@ -1094,9 +1031,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
case X86::ADD8rr:
+ case X86::ADD8rr_DB:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
case X86::ADD16rr:
case X86::ADD16rr_DB:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
case X86::ADD64ri32:
case X86::ADD64ri8:
case X86::ADD64ri32_DB:
@@ -1130,11 +1070,59 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
case X86::ADD8ri:
+ case X86::ADD8ri_DB:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri_DB:
case X86::ADD16ri8_DB:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
+ case X86::SUB8ri:
+ case X86::SUB16ri8:
+ case X86::SUB16ri:
+ /// FIXME: Support these similar to ADD8ri/ADD16ri*.
+ return nullptr;
+ case X86::SUB32ri8:
+ case X86::SUB32ri: {
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
+
+ assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ bool isKill;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+
+ NewMI = addOffset(MIB, -Imm);
+ break;
+ }
+
+ case X86::SUB64ri8:
+ case X86::SUB64ri32: {
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
+
+ assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
+ get(X86::LEA64r)).add(Dest).add(Src);
+ NewMI = addOffset(MIB, -Imm);
+ break;
+ }
+
case X86::VMOVDQU8Z128rmk:
case X86::VMOVDQU8Z256rmk:
case X86::VMOVDQU8Zrmk:
@@ -1522,7 +1510,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VBLENDPDrri:
case X86::VBLENDPSrri:
// If we're optimizing for size, try to use MOVSD/MOVSS.
- if (MI.getParent()->getParent()->getFunction().optForSize()) {
+ if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
unsigned Mask, Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
@@ -1548,47 +1536,90 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VPBLENDWrri:
case X86::VPBLENDDYrri:
case X86::VPBLENDWYrri:{
- unsigned Mask;
+ int8_t Mask;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
- case X86::BLENDPDrri: Mask = 0x03; break;
- case X86::BLENDPSrri: Mask = 0x0F; break;
- case X86::PBLENDWrri: Mask = 0xFF; break;
- case X86::VBLENDPDrri: Mask = 0x03; break;
- case X86::VBLENDPSrri: Mask = 0x0F; break;
- case X86::VBLENDPDYrri: Mask = 0x0F; break;
- case X86::VBLENDPSYrri: Mask = 0xFF; break;
- case X86::VPBLENDDrri: Mask = 0x0F; break;
- case X86::VPBLENDWrri: Mask = 0xFF; break;
- case X86::VPBLENDDYrri: Mask = 0xFF; break;
- case X86::VPBLENDWYrri: Mask = 0xFF; break;
+ case X86::BLENDPDrri: Mask = (int8_t)0x03; break;
+ case X86::BLENDPSrri: Mask = (int8_t)0x0F; break;
+ case X86::PBLENDWrri: Mask = (int8_t)0xFF; break;
+ case X86::VBLENDPDrri: Mask = (int8_t)0x03; break;
+ case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break;
+ case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break;
+ case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break;
+ case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break;
+ case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break;
+ case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break;
+ case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break;
}
// Only the least significant bits of Imm are used.
- unsigned Imm = MI.getOperand(3).getImm() & Mask;
+ // Using int8_t to ensure it will be sign extended to the int64_t that
+ // setImm takes in order to match isel behavior.
+ int8_t Imm = MI.getOperand(3).getImm() & Mask;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Mask ^ Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ case X86::VINSERTPSZrr: {
+ unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
+ unsigned ZMask = Imm & 15;
+ unsigned DstIdx = (Imm >> 4) & 3;
+ unsigned SrcIdx = (Imm >> 6) & 3;
+
+ // We can commute insertps if we zero 2 of the elements, the insertion is
+ // "inline" and we don't override the insertion with a zero.
+ if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
+ countPopulation(ZMask) == 2) {
+ unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15);
+ assert(AltIdx < 4 && "Illegal insertion index");
+ unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ return nullptr;
+ }
case X86::MOVSDrr:
case X86::MOVSSrr:
case X86::VMOVSDrr:
case X86::VMOVSSrr:{
// On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
- assert(Subtarget.hasSSE41() && "Commuting MOVSD/MOVSS requires SSE41!");
+ if (Subtarget.hasSSE41()) {
+ unsigned Mask, Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
+ case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
+ case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
+ case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+ }
- unsigned Mask, Opc;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("Unreachable!");
- case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
- case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
- case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
- case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
}
+ // Convert to SHUFPD.
+ assert(MI.getOpcode() == X86::MOVSDrr &&
+ "Can only commute MOVSDrr without SSE4.1");
+
auto &WorkingMI = cloneIfNew(MI);
- WorkingMI.setDesc(get(Opc));
- WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
+ WorkingMI.setDesc(get(X86::SHUFPDrri));
+ WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::SHUFPDrri: {
+ // Commute to MOVSD.
+ assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(X86::MOVSDrr));
+ WorkingMI.RemoveOperand(3);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
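
The blend-mask cases above switch the mask and immediate to int8_t so the value that eventually reaches MachineOperand::setImm (an int64_t) is sign extended the same way isel encodes these 8-bit immediates. A self-contained illustration of the arithmetic, not tied to any LLVM API; the 0x0F starting immediate is just an example value:

    #include <cassert>
    #include <cstdint>

    int main() {
      int8_t Mask = (int8_t)0xFF; // e.g. PBLENDWrri: all eight lanes selectable
      int8_t Imm = 0x0F & Mask;   // only the low bits of the operand are used
      // Commuting flips the lane selection. Staying in int8_t means the result
      // 0xF0 becomes -16 when widened to int64_t, matching how isel would have
      // written the commuted immediate.
      int64_t NewImm = (int8_t)(Mask ^ Imm);
      assert(NewImm == -16);
      return 0;
    }
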
@@ -1657,7 +1688,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
// Flip permute source immediate.
// Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
// Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
- unsigned Imm = MI.getOperand(3).getImm() & 0xFF;
+ int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
@@ -1686,76 +1717,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
- case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
- case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
- case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
- case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
- case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
- case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
- case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
- case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
- case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
- case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
- case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
- case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
- case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
- case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
- case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
- case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("Unreachable!");
- case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
- case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
- case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
- case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
- case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
- case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
- case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break;
- case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break;
- case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break;
- case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
- case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
- case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
- case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
- case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
- case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
- case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break;
- case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break;
- case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break;
- case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break;
- case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break;
- case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break;
- case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
- case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
- case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
- case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
- case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
- case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
- case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break;
- case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break;
- case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break;
- case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break;
- case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break;
- case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break;
- case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
- case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
- case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
- case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break;
- case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break;
- case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break;
- case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
- case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
- case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
- case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break;
- case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break;
- case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break;
- case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
- case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
- case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
- }
+ case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
auto &WorkingMI = cloneIfNew(MI);
- WorkingMI.setDesc(get(Opc));
+ unsigned OpNo = MI.getDesc().getNumOperands() - 1;
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
+ WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
@@ -1879,7 +1845,6 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
// regardless of the FMA opcode. The FMA opcode is adjusted later.
if (SrcOpIdx1 == CommuteAnyOperandIndex ||
SrcOpIdx2 == CommuteAnyOperandIndex) {
- unsigned CommutableOpIdx1 = SrcOpIdx1;
unsigned CommutableOpIdx2 = SrcOpIdx2;
// At least one of operands to be commuted is not specified and
@@ -1895,6 +1860,8 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
// CommutableOpIdx2 is well defined now. Let's choose another commutable
// operand and assign its index to CommutableOpIdx1.
unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
+
+ unsigned CommutableOpIdx1;
for (CommutableOpIdx1 = LastCommutableVecOp;
CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
// Just ignore and skip the k-mask operand.
@@ -1946,28 +1913,43 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::VCMPPDZ128rri:
case X86::VCMPPSZ128rri:
case X86::VCMPPDZ256rri:
- case X86::VCMPPSZ256rri: {
+ case X86::VCMPPSZ256rri:
+ case X86::VCMPPDZrrik:
+ case X86::VCMPPSZrrik:
+ case X86::VCMPPDZ128rrik:
+ case X86::VCMPPSZ128rrik:
+ case X86::VCMPPDZ256rrik:
+ case X86::VCMPPSZ256rrik: {
+ unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
+
// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests
- unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
switch (Imm) {
case 0x00: // EQUAL
case 0x03: // UNORDERED
case 0x04: // NOT EQUAL
case 0x07: // ORDERED
- // The indices of the commutable operands are 1 and 2.
+ // The indices of the commutable operands are 1 and 2 (or 2 and 3
+ // when masked).
// Assign them to the returned operand indices here.
- return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
+ 2 + OpOffset);
}
return false;
}
- case X86::MOVSDrr:
case X86::MOVSSrr:
- case X86::VMOVSDrr:
- case X86::VMOVSSrr:
+ // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
+ // form an SSE4.1 blend. We assume VMOVSSrr/VMOVSDrr are always commutable
+ // since AVX implies SSE4.1.
if (Subtarget.hasSSE41())
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return false;
+ case X86::SHUFPDrri:
+ // We can commute this to MOVSD.
+ if (MI.getOperand(3).getImm() == 0x02)
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
case X86::MOVHLPSrr:
case X86::UNPCKHPDrr:
case X86::VMOVHLPSrr:
@@ -2089,125 +2071,33 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
return false;
}
-X86::CondCode X86::getCondFromBranchOpc(unsigned BrOpc) {
- switch (BrOpc) {
+X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
- case X86::JE_1: return X86::COND_E;
- case X86::JNE_1: return X86::COND_NE;
- case X86::JL_1: return X86::COND_L;
- case X86::JLE_1: return X86::COND_LE;
- case X86::JG_1: return X86::COND_G;
- case X86::JGE_1: return X86::COND_GE;
- case X86::JB_1: return X86::COND_B;
- case X86::JBE_1: return X86::COND_BE;
- case X86::JA_1: return X86::COND_A;
- case X86::JAE_1: return X86::COND_AE;
- case X86::JS_1: return X86::COND_S;
- case X86::JNS_1: return X86::COND_NS;
- case X86::JP_1: return X86::COND_P;
- case X86::JNP_1: return X86::COND_NP;
- case X86::JO_1: return X86::COND_O;
- case X86::JNO_1: return X86::COND_NO;
- }
-}
-
-/// Return condition code of a SET opcode.
-X86::CondCode X86::getCondFromSETOpc(unsigned Opc) {
- switch (Opc) {
+ case X86::JCC_1:
+ return static_cast<X86::CondCode>(
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
+ }
+}
+
+/// Return condition code of a SETCC opcode.
+X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
- case X86::SETAr: case X86::SETAm: return X86::COND_A;
- case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
- case X86::SETBr: case X86::SETBm: return X86::COND_B;
- case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
- case X86::SETEr: case X86::SETEm: return X86::COND_E;
- case X86::SETGr: case X86::SETGm: return X86::COND_G;
- case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
- case X86::SETLr: case X86::SETLm: return X86::COND_L;
- case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
- case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
- case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
- case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
- case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
- case X86::SETOr: case X86::SETOm: return X86::COND_O;
- case X86::SETPr: case X86::SETPm: return X86::COND_P;
- case X86::SETSr: case X86::SETSm: return X86::COND_S;
+ case X86::SETCCr: case X86::SETCCm:
+ return static_cast<X86::CondCode>(
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
}
}
/// Return condition code of a CMov opcode.
-X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
- switch (Opc) {
+X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
- case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm:
- case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr:
- return X86::COND_A;
- case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
- case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
- return X86::COND_AE;
- case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm:
- case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr:
- return X86::COND_B;
- case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
- case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
- return X86::COND_BE;
- case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm:
- case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr:
- return X86::COND_E;
- case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm:
- case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr:
- return X86::COND_G;
- case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
- case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
- return X86::COND_GE;
- case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm:
- case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr:
- return X86::COND_L;
- case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
- case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
- return X86::COND_LE;
- case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
- case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
- return X86::COND_NE;
- case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
- case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
- return X86::COND_NO;
- case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
- case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
- return X86::COND_NP;
- case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
- case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
- return X86::COND_NS;
- case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm:
- case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr:
- return X86::COND_O;
- case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm:
- case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr:
- return X86::COND_P;
- case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm:
- case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr:
- return X86::COND_S;
- }
-}
-
-unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
- switch (CC) {
- default: llvm_unreachable("Illegal condition code!");
- case X86::COND_E: return X86::JE_1;
- case X86::COND_NE: return X86::JNE_1;
- case X86::COND_L: return X86::JL_1;
- case X86::COND_LE: return X86::JLE_1;
- case X86::COND_G: return X86::JG_1;
- case X86::COND_GE: return X86::JGE_1;
- case X86::COND_B: return X86::JB_1;
- case X86::COND_BE: return X86::JBE_1;
- case X86::COND_A: return X86::JA_1;
- case X86::COND_AE: return X86::JAE_1;
- case X86::COND_S: return X86::JS_1;
- case X86::COND_NS: return X86::JNS_1;
- case X86::COND_P: return X86::JP_1;
- case X86::COND_NP: return X86::JNP_1;
- case X86::COND_O: return X86::JO_1;
- case X86::COND_NO: return X86::JNO_1;
+ case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr:
+ case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm:
+ return static_cast<X86::CondCode>(
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
}
}
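
With JCC_1, SETCCr/SETCCm, and CMOV*rr/rm carrying the condition as an immediate, the three helpers above all reduce to reading the instruction's last declared operand once the opcode has been matched. A hedged sketch of that shared idiom (the helper name is ours, for illustration only):

    // Sketch only: assumes the caller has already verified the opcode is one
    // of the condition-carrying instructions handled above.
    #include "llvm/CodeGen/MachineInstr.h"
    #include "MCTargetDesc/X86BaseInfo.h"
    using namespace llvm;

    static X86::CondCode getCondFromLastOperand(const MachineInstr &MI) {
      unsigned OpNo = MI.getDesc().getNumOperands() - 1;
      return static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
    }
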
@@ -2293,78 +2183,18 @@ X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
return std::make_pair(CC, NeedSwap);
}
-/// Return a set opcode for the given condition and
-/// whether it has memory operand.
-unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
- static const uint16_t Opc[16][2] = {
- { X86::SETAr, X86::SETAm },
- { X86::SETAEr, X86::SETAEm },
- { X86::SETBr, X86::SETBm },
- { X86::SETBEr, X86::SETBEm },
- { X86::SETEr, X86::SETEm },
- { X86::SETGr, X86::SETGm },
- { X86::SETGEr, X86::SETGEm },
- { X86::SETLr, X86::SETLm },
- { X86::SETLEr, X86::SETLEm },
- { X86::SETNEr, X86::SETNEm },
- { X86::SETNOr, X86::SETNOm },
- { X86::SETNPr, X86::SETNPm },
- { X86::SETNSr, X86::SETNSm },
- { X86::SETOr, X86::SETOm },
- { X86::SETPr, X86::SETPm },
- { X86::SETSr, X86::SETSm }
- };
-
- assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
- return Opc[CC][HasMemoryOperand ? 1 : 0];
-}
-
-/// Return a cmov opcode for the given condition,
-/// register size in bytes, and operand type.
-unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
- bool HasMemoryOperand) {
- static const uint16_t Opc[32][3] = {
- { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr },
- { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
- { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr },
- { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
- { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr },
- { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr },
- { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
- { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr },
- { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
- { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
- { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
- { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
- { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
- { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr },
- { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr },
- { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr },
- { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm },
- { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
- { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm },
- { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
- { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm },
- { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm },
- { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
- { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm },
- { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
- { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
- { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
- { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
- { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
- { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm },
- { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm },
- { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm }
- };
+/// Return a setcc opcode based on whether it has a memory operand.
+unsigned X86::getSETOpc(bool HasMemoryOperand) {
+ return HasMemoryOperand ? X86::SETCCm : X86::SETCCr;
+}
- assert(CC < 16 && "Can only handle standard cond codes");
- unsigned Idx = HasMemoryOperand ? 16+CC : CC;
+/// Return a cmov opcode for the given register size in bytes, and operand type.
+unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
switch(RegBytes) {
default: llvm_unreachable("Illegal register size!");
- case 2: return Opc[Idx][0];
- case 4: return Opc[Idx][1];
- case 8: return Opc[Idx][2];
+ case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
+ case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
+ case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
}
}
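
Callers that previously looked up a per-condition CMOV opcode now pick only the size/memory form and append the condition code as an operand, as insertSelect below does. A rough usage fragment, assuming MBB, I, DL, DstReg, TrueReg, and FalseReg are already in scope and TII points at the X86 instruction info:

    // Sketch only: emit "DstReg = COND_E ? TrueReg : FalseReg" for a 32-bit
    // register class using the unified CMOV encoding.
    unsigned Opc = X86::getCMovOpcode(/*RegBytes=*/4, /*HasMemoryOperand=*/false);
    BuildMI(MBB, I, DL, TII->get(Opc), DstReg)
        .addReg(FalseReg)
        .addReg(TrueReg)
        .addImm(X86::COND_E); // the condition is now an explicit operand
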
@@ -2490,7 +2320,7 @@ void X86InstrInfo::replaceBranchWithTailCall(
if (!I->isBranch())
assert(0 && "Can't find the branch to replace!");
- X86::CondCode CC = X86::getCondFromBranchOpc(I->getOpcode());
+ X86::CondCode CC = X86::getCondFromBranch(*I);
assert(BranchCond.size() == 1);
if (CC != BranchCond[0].getImm())
continue;
@@ -2597,13 +2427,13 @@ bool X86InstrInfo::AnalyzeBranchImpl(
}
// Handle conditional branches.
- X86::CondCode BranchCode = X86::getCondFromBranchOpc(I->getOpcode());
+ X86::CondCode BranchCode = X86::getCondFromBranch(*I);
if (BranchCode == X86::COND_INVALID)
return true; // Can't handle indirect branch.
// In practice we should never have an undef EFLAGS operand; if we do,
// abort here as we are not prepared to preserve the flag.
- if (I->getOperand(1).isUndef())
+ if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
return true;
// Working from the bottom, handle the first conditional branch.
@@ -2629,11 +2459,11 @@ bool X86InstrInfo::AnalyzeBranchImpl(
// Which is a bit more efficient.
// We conditionally jump to the fall-through block.
BranchCode = GetOppositeBranchCondition(BranchCode);
- unsigned JNCC = GetCondBranchFromCond(BranchCode);
MachineBasicBlock::iterator OldInst = I;
- BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
- .addMBB(UnCondBrIter->getOperand(0).getMBB());
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1))
+ .addMBB(UnCondBrIter->getOperand(0).getMBB())
+ .addImm(BranchCode);
BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
.addMBB(TargetBB);
@@ -2798,7 +2628,7 @@ unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
if (I->isDebugInstr())
continue;
if (I->getOpcode() != X86::JMP_1 &&
- X86::getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ X86::getCondFromBranch(*I) == X86::COND_INVALID)
break;
// Remove the branch.
I->eraseFromParent();
@@ -2837,9 +2667,9 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
switch (CC) {
case X86::COND_NE_OR_P:
// Synthesize NE_OR_P with two branches.
- BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
++Count;
- BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
++Count;
break;
case X86::COND_E_AND_NP:
@@ -2850,14 +2680,13 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
"body is a fall-through.");
}
// Synthesize COND_E_AND_NP with two branches.
- BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB);
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
++Count;
- BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
++Count;
break;
default: {
- unsigned Opc = GetCondBranchFromCond(CC);
- BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
++Count;
}
}
@@ -2880,7 +2709,7 @@ canInsertSelect(const MachineBasicBlock &MBB,
if (Cond.size() != 1)
return false;
// We cannot do the composite conditions, at least not in SSA form.
- if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+ if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
return false;
// Check register classes.
@@ -2915,10 +2744,12 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
assert(Cond.size() == 1 && "Invalid Cond array");
- unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
- TRI.getRegSizeInBits(RC) / 8,
- false /*HasMemoryOperand*/);
- BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+ unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
+ false /*HasMemoryOperand*/);
+ BuildMI(MBB, I, DL, get(Opc), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addImm(Cond[0].getImm());
}
/// Test if the given register is a physical h register.
@@ -2984,22 +2815,22 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
return X86::MMX_MOVD64to64rr;
}
- // SrcReg(FR32) -> DestReg(GR32)
- // SrcReg(GR32) -> DestReg(FR32)
+ // SrcReg(VR128) -> DestReg(GR32)
+ // SrcReg(GR32) -> DestReg(VR128)
if (X86::GR32RegClass.contains(DestReg) &&
- X86::FR32XRegClass.contains(SrcReg))
- // Copy from a FR32 register to a GR32 register.
- return HasAVX512 ? X86::VMOVSS2DIZrr :
- HasAVX ? X86::VMOVSS2DIrr :
- X86::MOVSS2DIrr;
+ X86::VR128XRegClass.contains(SrcReg))
+ // Copy from a VR128 register to a GR32 register.
+ return HasAVX512 ? X86::VMOVPDI2DIZrr :
+ HasAVX ? X86::VMOVPDI2DIrr :
+ X86::MOVPDI2DIrr;
- if (X86::FR32XRegClass.contains(DestReg) &&
+ if (X86::VR128XRegClass.contains(DestReg) &&
X86::GR32RegClass.contains(SrcReg))
- // Copy from a GR32 register to a FR32 register.
- return HasAVX512 ? X86::VMOVDI2SSZrr :
- HasAVX ? X86::VMOVDI2SSrr :
- X86::MOVDI2SSrr;
+ // Copy from a GR32 register to a VR128 register.
+ return HasAVX512 ? X86::VMOVDI2PDIZrr :
+ HasAVX ? X86::VMOVDI2PDIrr :
+ X86::MOVDI2PDIrr;
return 0;
}
@@ -3129,22 +2960,38 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
return load ? X86::MOV32rm : X86::MOV32mr;
if (X86::FR32XRegClass.hasSubClassEq(RC))
return load ?
- (HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
- (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+ (HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt) :
+ (HasAVX512 ? X86::VMOVSSZmr :
+ HasAVX ? X86::VMOVSSmr :
+ X86::MOVSSmr);
if (X86::RFP32RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp32m : X86::ST_Fp32m;
if (X86::VK32RegClass.hasSubClassEq(RC)) {
assert(STI.hasBWI() && "KMOVD requires BWI");
return load ? X86::KMOVDkm : X86::KMOVDmk;
}
+ // All of these mask pair classes have the same spill size, so the same
+ // kind of kmov instructions can be used with all of them.
+ if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK16PAIRRegClass.hasSubClassEq(RC))
+ return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
llvm_unreachable("Unknown 4-byte regclass");
case 8:
if (X86::GR64RegClass.hasSubClassEq(RC))
return load ? X86::MOV64rm : X86::MOV64mr;
if (X86::FR64XRegClass.hasSubClassEq(RC))
return load ?
- (HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
- (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+ (HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt) :
+ (HasAVX512 ? X86::VMOVSDZmr :
+ HasAVX ? X86::VMOVSDmr :
+ X86::MOVSDmr);
if (X86::VR64RegClass.hasSubClassEq(RC))
return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
if (X86::RFP64RegClass.hasSubClassEq(RC))
@@ -3219,7 +3066,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
}
bool X86InstrInfo::getMemOperandWithOffset(
- MachineInstr &MemOp, MachineOperand *&BaseOp, int64_t &Offset,
+ const MachineInstr &MemOp, const MachineOperand *&BaseOp, int64_t &Offset,
const TargetRegisterInfo *TRI) const {
const MCInstrDesc &Desc = MemOp.getDesc();
int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
@@ -3572,25 +3419,39 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
- case X86::LZCNT16rr: case X86::LZCNT16rm:
- case X86::LZCNT32rr: case X86::LZCNT32rm:
- case X86::LZCNT64rr: case X86::LZCNT64rm:
+ case X86::NEG8r:
+ case X86::NEG16r:
+ case X86::NEG32r:
+ case X86::NEG64r:
+ return X86::COND_AE;
+ case X86::LZCNT16rr:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rr:
return X86::COND_B;
- case X86::POPCNT16rr:case X86::POPCNT16rm:
- case X86::POPCNT32rr:case X86::POPCNT32rm:
- case X86::POPCNT64rr:case X86::POPCNT64rm:
+ case X86::POPCNT16rr:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rr:
return X86::COND_E;
- case X86::TZCNT16rr: case X86::TZCNT16rm:
- case X86::TZCNT32rr: case X86::TZCNT32rm:
- case X86::TZCNT64rr: case X86::TZCNT64rm:
+ case X86::TZCNT16rr:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rr:
return X86::COND_B;
- case X86::BSF16rr: case X86::BSF16rm:
- case X86::BSF32rr: case X86::BSF32rm:
- case X86::BSF64rr: case X86::BSF64rm:
- case X86::BSR16rr: case X86::BSR16rm:
- case X86::BSR32rr: case X86::BSR32rm:
- case X86::BSR64rr: case X86::BSR64rm:
+ case X86::BSF16rr:
+ case X86::BSF32rr:
+ case X86::BSF64rr:
+ case X86::BSR16rr:
+ case X86::BSR32rr:
+ case X86::BSR64rr:
return X86::COND_E;
+ case X86::BLSI32rr:
+ case X86::BLSI64rr:
+ return X86::COND_AE;
+ case X86::BLSR32rr:
+ case X86::BLSR64rr:
+ case X86::BLSMSK32rr:
+ case X86::BLSMSK64rr:
+ return X86::COND_B;
+ // TODO: TBM instructions.
}
}
@@ -3602,7 +3463,6 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
int CmpValue,
const MachineRegisterInfo *MRI) const {
// Check whether we can replace SUB with CMP.
- unsigned NewOpcode = 0;
switch (CmpInstr.getOpcode()) {
default: break;
case X86::SUB64ri32:
@@ -3623,6 +3483,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
// There is no use of the destination register, so we can replace SUB with CMP.
+ unsigned NewOpcode = 0;
switch (CmpInstr.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
@@ -3746,7 +3607,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// If we are done with the basic block, we need to check whether EFLAGS is
// live-out.
bool IsSafe = false;
- SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
+ SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate;
MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
for (++I; I != E; ++I) {
const MachineInstr &Instr = *I;
@@ -3763,17 +3624,14 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// EFLAGS is used by this instruction.
X86::CondCode OldCC = X86::COND_INVALID;
- bool OpcIsSET = false;
if (IsCmpZero || IsSwapped) {
// We decode the condition code from opcode.
if (Instr.isBranch())
- OldCC = X86::getCondFromBranchOpc(Instr.getOpcode());
+ OldCC = X86::getCondFromBranch(Instr);
else {
- OldCC = X86::getCondFromSETOpc(Instr.getOpcode());
- if (OldCC != X86::COND_INVALID)
- OpcIsSET = true;
- else
- OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
+ OldCC = X86::getCondFromSETCC(Instr);
+ if (OldCC == X86::COND_INVALID)
+ OldCC = X86::getCondFromCMov(Instr);
}
if (OldCC == X86::COND_INVALID) return false;
}
@@ -3818,24 +3676,10 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
}
if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
- // Synthesize the new opcode.
- bool HasMemoryOperand = Instr.hasOneMemOperand();
- unsigned NewOpc;
- if (Instr.isBranch())
- NewOpc = GetCondBranchFromCond(ReplacementCC);
- else if(OpcIsSET)
- NewOpc = getSETFromCond(ReplacementCC, HasMemoryOperand);
- else {
- unsigned DstReg = Instr.getOperand(0).getReg();
- const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
- NewOpc = getCMovFromCond(ReplacementCC, TRI->getRegSizeInBits(*DstRC)/8,
- HasMemoryOperand);
- }
-
// Push the MachineInstr to OpsToUpdate.
// If it is safe to remove CmpInstr, the condition code of these
// instructions will be modified.
- OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+ OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC));
}
if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
// It is safe to remove CmpInstr if EFLAGS is updated again or killed.
@@ -3876,21 +3720,17 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
}
// Make sure Sub instruction defines EFLAGS and mark the def live.
- unsigned i = 0, e = Sub->getNumOperands();
- for (; i != e; ++i) {
- MachineOperand &MO = Sub->getOperand(i);
- if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
- MO.setIsDead(false);
- break;
- }
- }
- assert(i != e && "Unable to locate a def EFLAGS operand");
+ MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
+ assert(FlagDef && "Unable to locate a def EFLAGS operand");
+ FlagDef->setIsDead(false);
CmpInstr.eraseFromParent();
// Modify the condition code of instructions in OpsToUpdate.
- for (auto &Op : OpsToUpdate)
- Op.first->setDesc(get(Op.second));
+ for (auto &Op : OpsToUpdate) {
+ Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
+ .setImm(Op.second);
+ }
return true;
}
@@ -4128,6 +3968,20 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB,
return true;
}
+
+static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
+ MIB->setDesc(Desc);
+ int64_t ShiftAmt = MIB->getOperand(2).getImm();
+ // Temporarily remove the immediate so we can add another source register.
+ MIB->RemoveOperand(2);
+ // Add the register. Don't copy the kill flag if there is one.
+ MIB.addReg(MIB->getOperand(1).getReg(),
+ getUndefRegState(MIB->getOperand(1).isUndef()));
+ // Add back the immediate.
+ MIB.addImm(ShiftAmt);
+ return true;
+}
+
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -4193,6 +4047,12 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MIB.addReg(SrcReg, RegState::ImplicitDefine);
return true;
}
+ if (MI.getOpcode() == X86::AVX512_256_SET0) {
+ // No VLX so we must reference a zmm.
+ unsigned ZReg =
+ TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(ZReg);
+ }
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
}
case X86::V_SETALLONES:
@@ -4282,6 +4142,21 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::XOR64_FP:
case X86::XOR32_FP:
return expandXorFP(MIB, *this);
+ case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8));
+ case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8));
+ case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8));
+ case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8));
+ case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break;
+ case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break;
+ case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break;
+ case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break;
+ case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break;
+ case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break;
+ case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break;
+ case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break;
+ case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break;
+ case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break;
+ case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break;
}
return false;
}
@@ -4303,7 +4178,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
/// FIXME: This should be turned into a TSFlags.
///
static bool hasPartialRegUpdate(unsigned Opcode,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ bool ForLoadFold = false) {
switch (Opcode) {
case X86::CVTSI2SSrr:
case X86::CVTSI2SSrm:
@@ -4313,6 +4189,9 @@ static bool hasPartialRegUpdate(unsigned Opcode,
case X86::CVTSI2SDrm:
case X86::CVTSI642SDrr:
case X86::CVTSI642SDrm:
+ // Load folding won't affect the undef register update since the input is
+ // a GPR.
+ return !ForLoadFold;
case X86::CVTSD2SSrr:
case X86::CVTSD2SSrm:
case X86::CVTSS2SDrr:
@@ -4389,7 +4268,7 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// Return true for any instruction the copies the high bits of the first source
// operand into the unused high bits of the destination operand.
-static bool hasUndefRegUpdate(unsigned Opcode) {
+static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) {
switch (Opcode) {
case X86::VCVTSI2SSrr:
case X86::VCVTSI2SSrm:
@@ -4407,38 +4286,6 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::VCVTSI642SDrm:
case X86::VCVTSI642SDrr_Int:
case X86::VCVTSI642SDrm_Int:
- case X86::VCVTSD2SSrr:
- case X86::VCVTSD2SSrm:
- case X86::VCVTSD2SSrr_Int:
- case X86::VCVTSD2SSrm_Int:
- case X86::VCVTSS2SDrr:
- case X86::VCVTSS2SDrm:
- case X86::VCVTSS2SDrr_Int:
- case X86::VCVTSS2SDrm_Int:
- case X86::VRCPSSr:
- case X86::VRCPSSr_Int:
- case X86::VRCPSSm:
- case X86::VRCPSSm_Int:
- case X86::VROUNDSDr:
- case X86::VROUNDSDm:
- case X86::VROUNDSDr_Int:
- case X86::VROUNDSDm_Int:
- case X86::VROUNDSSr:
- case X86::VROUNDSSm:
- case X86::VROUNDSSr_Int:
- case X86::VROUNDSSm_Int:
- case X86::VRSQRTSSr:
- case X86::VRSQRTSSr_Int:
- case X86::VRSQRTSSm:
- case X86::VRSQRTSSm_Int:
- case X86::VSQRTSSr:
- case X86::VSQRTSSr_Int:
- case X86::VSQRTSSm:
- case X86::VSQRTSSm_Int:
- case X86::VSQRTSDr:
- case X86::VSQRTSDr_Int:
- case X86::VSQRTSDm:
- case X86::VSQRTSDm_Int:
// AVX-512
case X86::VCVTSI2SSZrr:
case X86::VCVTSI2SSZrm:
@@ -4453,7 +4300,6 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::VCVTSI2SDZrr:
case X86::VCVTSI2SDZrm:
case X86::VCVTSI2SDZrr_Int:
- case X86::VCVTSI2SDZrrb_Int:
case X86::VCVTSI2SDZrm_Int:
case X86::VCVTSI642SDZrr:
case X86::VCVTSI642SDZrm:
@@ -4479,6 +4325,42 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::VCVTUSI642SDZrr_Int:
case X86::VCVTUSI642SDZrrb_Int:
case X86::VCVTUSI642SDZrm_Int:
+ // Load folding won't affect the undef register update since the input is
+ // a GPR.
+ return !ForLoadFold;
+ case X86::VCVTSD2SSrr:
+ case X86::VCVTSD2SSrm:
+ case X86::VCVTSD2SSrr_Int:
+ case X86::VCVTSD2SSrm_Int:
+ case X86::VCVTSS2SDrr:
+ case X86::VCVTSS2SDrm:
+ case X86::VCVTSS2SDrr_Int:
+ case X86::VCVTSS2SDrm_Int:
+ case X86::VRCPSSr:
+ case X86::VRCPSSr_Int:
+ case X86::VRCPSSm:
+ case X86::VRCPSSm_Int:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDm:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSDm_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSm:
+ case X86::VROUNDSSr_Int:
+ case X86::VROUNDSSm_Int:
+ case X86::VRSQRTSSr:
+ case X86::VRSQRTSSr_Int:
+ case X86::VRSQRTSSm:
+ case X86::VRSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ // AVX-512
case X86::VCVTSD2SSZrr:
case X86::VCVTSD2SSZrr_Int:
case X86::VCVTSD2SSZrrb_Int:
@@ -4759,7 +4641,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if (Size <= RCSize && 4 <= Align) {
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && 4 <= Align) {
int PtrOffset = SrcIdx * 4;
unsigned NewImm = (DstIdx << 4) | ZMask;
unsigned NewOpCode =
@@ -4783,7 +4665,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if (Size <= RCSize && 8 <= Align) {
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && 8 <= Align) {
unsigned NewOpCode =
(MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
(MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm :
@@ -4794,13 +4676,29 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
}
}
break;
- };
+ case X86::UNPCKLPDrr:
+ // If we won't be able to fold this to the memory form of UNPCKL, use
+ // MOVHPD instead. Done as custom because we can't have this in the load
+ // table twice.
+ if (OpNum == 2) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Align < 16) {
+ MachineInstr *NewMI =
+ FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
+ return NewMI;
+ }
+ }
+ break;
+ }
return nullptr;
}
-static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) {
- if (MF.getFunction().optForSize() || !hasUndefRegUpdate(MI.getOpcode()) ||
+static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
+ MachineInstr &MI) {
+ if (!hasUndefRegUpdate(MI.getOpcode(), /*ForLoadFold*/true) ||
!MI.getOperand(1).isReg())
return false;
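
A hedged reading of the custom folds earlier in this hunk: they now share one gate, sketched below with an illustrative helper (not an upstream function). The reloaded operand must come from a full 16-byte spill slot, or from an unconstrained slot (Size == 0), and must live in a 128-bit register class; each fold then layers its own alignment rule on top, with UNPCKLPDrr deliberately requiring Align < 16 so that under-aligned slots take the MOVHPDrm fallback instead of the normal table fold to UNPCKLPDrm.

// Illustrative helper capturing the shared check in foldMemoryOperandCustom.
static bool isFullXmmSpillSlot(unsigned Size, unsigned RCSize) {
  // Size == 0 means the caller left the slot size unconstrained; otherwise
  // the slot must hold a full XMM register so the folded load reads valid
  // spill data.
  return (Size == 0 || Size >= 16) && RCSize >= 16;
}
// Per-opcode alignment requirements layered on top, as in the code above:
//   INSERTPSrr variants : isFullXmmSpillSlot(Size, RCSize) && Align >= 4
//   (V)MOVHLPS variants : isFullXmmSpillSlot(Size, RCSize) && Align >= 8
//   UNPCKLPDrr          : isFullXmmSpillSlot(Size, RCSize) && Align < 16
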
@@ -4828,15 +4726,15 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// For CPUs that favor the register form of a call or push,
// do not fold loads into calls or pushes, unless optimizing for size
// aggressively.
- if (isSlowTwoMemOps && !MF.getFunction().optForMinSize() &&
+ if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
(MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
MI.getOpcode() == X86::PUSH64r))
return nullptr;
// Avoid partial and undef register update stalls unless optimizing for size.
- if (!MF.getFunction().optForSize() &&
- (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ if (!MF.getFunction().hasOptSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
@@ -4899,6 +4797,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
&RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
if (Size < RCSize) {
+ // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
@@ -4937,9 +4836,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
bool HasDef = MI.getDesc().getNumDefs();
- unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
- unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
- unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
+ Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
+ Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
+ Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
bool Tied1 =
0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
bool Tied2 =
@@ -4997,14 +4896,15 @@ MachineInstr *
X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt,
- int FrameIndex, LiveIntervals *LIS) const {
+ int FrameIndex, LiveIntervals *LIS,
+ VirtRegMap *VRM) const {
// Check switch flag
if (NoFusing)
return nullptr;
// Avoid partial and undef register update stalls unless optimizing for size.
- if (!MF.getFunction().optForSize() &&
- (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ if (!MF.getFunction().hasOptSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
@@ -5073,7 +4973,9 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
unsigned RegSize = TRI.getRegSizeInBits(*RC);
- if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) &&
+ if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
+ Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
+ Opc == X86::VMOVSSZrm_alt) &&
RegSize > 32) {
     // These instructions only load 32 bits; we can't fold them if the
// destination register is wider than 32 bits (4 bytes), and its user
@@ -5087,6 +4989,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
+ case X86::VCMPSSZrr_Intk:
case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
@@ -5124,7 +5027,9 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
}
}
- if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) &&
+ if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
+ Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
+ Opc == X86::VMOVSDZrm_alt) &&
RegSize > 64) {
     // These instructions only load 64 bits; we can't fold them if the
// destination register is wider than 64 bits (8 bytes), and its user
@@ -5138,6 +5043,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
+ case X86::VCMPSDZrr_Intk:
case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
@@ -5203,8 +5109,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (NoFusing) return nullptr;
// Avoid partial and undef register update stalls unless optimizing for size.
- if (!MF.getFunction().optForSize() &&
- (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ if (!MF.getFunction().hasOptSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
@@ -5359,10 +5265,7 @@ extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
} else {
// Clone the MMO and unset the store flag.
LoadMMOs.push_back(MF.getMachineMemOperand(
- MMO->getPointerInfo(), MMO->getFlags() & ~MachineMemOperand::MOStore,
- MMO->getSize(), MMO->getBaseAlignment(), MMO->getAAInfo(), nullptr,
- MMO->getSyncScopeID(), MMO->getOrdering(),
- MMO->getFailureOrdering()));
+ MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
}
}
@@ -5383,10 +5286,7 @@ extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
} else {
// Clone the MMO and unset the load flag.
StoreMMOs.push_back(MF.getMachineMemOperand(
- MMO->getPointerInfo(), MMO->getFlags() & ~MachineMemOperand::MOLoad,
- MMO->getSize(), MMO->getBaseAlignment(), MMO->getAAInfo(), nullptr,
- MMO->getSyncScopeID(), MMO->getOrdering(),
- MMO->getFailureOrdering()));
+ MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
}
}
@@ -5668,7 +5568,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::LD_Fp64m:
case X86::LD_Fp80m:
case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::MOVAPSrm:
@@ -5679,7 +5581,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVDQUrm:
// AVX load instructions
case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
@@ -5694,7 +5598,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVDQUYrm:
// AVX512 load instructions
case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
case X86::VMOVAPSZ128rm_NOVLX:
@@ -5745,7 +5651,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::LD_Fp64m:
case X86::LD_Fp80m:
case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::MOVAPSrm:
@@ -5756,7 +5664,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVDQUrm:
// AVX load instructions
case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
@@ -5771,7 +5681,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVDQUYrm:
// AVX512 load instructions
case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
case X86::VMOVAPSZ128rm_NOVLX:
@@ -5943,7 +5855,9 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::MOVSDmr, X86::MOVSDmr, X86::MOVPQI2QImr },
{ X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr },
{ X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm },
+ { X86::MOVSDrm_alt,X86::MOVSDrm_alt,X86::MOVQI2PQIrm },
{ X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm },
+ { X86::MOVSSrm_alt,X86::MOVSSrm_alt,X86::MOVDI2PDIrm },
{ X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
{ X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
{ X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
@@ -5973,7 +5887,9 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVSDmr, X86::VMOVSDmr, X86::VMOVPQI2QImr },
{ X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr },
{ X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm },
+ { X86::VMOVSDrm_alt,X86::VMOVSDrm_alt,X86::VMOVQI2PQIrm },
{ X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm },
+ { X86::VMOVSSrm_alt,X86::VMOVSSrm_alt,X86::VMOVDI2PDIrm },
{ X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
{ X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
{ X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
@@ -6012,13 +5928,17 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr },
{ X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr },
{ X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm },
+ { X86::VMOVSDZrm_alt, X86::VMOVSDZrm_alt, X86::VMOVQI2PQIZrm },
{ X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm },
+ { X86::VMOVSSZrm_alt, X86::VMOVSSZrm_alt, X86::VMOVDI2PDIZrm },
{ X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r },
{ X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m },
{ X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r },
{ X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m },
{ X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr },
{ X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm },
+ { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128r },
+ { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128m },
{ X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r },
{ X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
{ X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr },
@@ -6109,6 +6029,8 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr },
{ X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
+ { X86::VMOVDDUPrm, X86::VMOVDDUPrm, X86::VPBROADCASTQrm},
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrr, X86::VPBROADCASTQrr},
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
{ X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
@@ -6128,6 +6050,19 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrr, X86::VPUNPCKHDQYrr },
};
+static const uint16_t ReplaceableInstrsFP[][3] = {
+ //PackedSingle PackedDouble
+ { X86::MOVLPSrm, X86::MOVLPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::MOVHPSrm, X86::MOVHPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::MOVHPSmr, X86::MOVHPDmr, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVLPSrm, X86::VMOVLPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSrm, X86::VMOVHPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSmr, X86::VMOVHPDmr, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVLPSZ128rm, X86::VMOVLPDZ128rm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSZ128rm, X86::VMOVHPDZ128rm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSZ128mr, X86::VMOVHPDZ128mr, X86::INSTRUCTION_LIST_END },
+};
+
static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
@@ -6368,7 +6303,7 @@ static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
};
// NOTE: These should only be used by the custom domain methods.
-static const uint16_t ReplaceableCustomInstrs[][3] = {
+static const uint16_t ReplaceableBlendInstrs[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi },
{ X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri },
@@ -6377,7 +6312,7 @@ static const uint16_t ReplaceableCustomInstrs[][3] = {
{ X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi },
{ X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri },
};
-static const uint16_t ReplaceableCustomAVX2Instrs[][3] = {
+static const uint16_t ReplaceableBlendAVX2Instrs[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi },
{ X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri },
@@ -6552,6 +6487,8 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
MI.getOperand(2).getSubReg() == 0)
return 0x6;
return 0;
+ case X86::SHUFPDrri:
+ return 0x6;
}
return 0;
}
@@ -6571,9 +6508,9 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
unsigned NewImm = Imm;
- const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs);
+ const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
if (!table)
- table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
+ table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
if (Domain == 1) { // PackedSingle
AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
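
The AdjustBlendMask calls above rescale a blend immediate between element widths; a minimal sketch of that rescaling follows (the in-tree helper may differ in signature and return type), along with a worked example.

// Widen a blend mask from OldWidth to NewWidth lanes by replicating each
// selector bit across all of the narrower lanes it now covers, and collapse
// groups of bits when narrowing.
static void AdjustBlendMaskSketch(unsigned OldMask, unsigned OldWidth,
                                  unsigned NewWidth, unsigned *pNewMask) {
  unsigned NewMask = 0;
  if (NewWidth >= OldWidth) {
    unsigned Scale = NewWidth / OldWidth;
    for (unsigned I = 0; I != OldWidth; ++I)
      if (OldMask & (1u << I))
        NewMask |= ((1u << Scale) - 1) << (I * Scale);
  } else {
    unsigned Scale = OldWidth / NewWidth;
    for (unsigned I = 0; I != NewWidth; ++I)
      if (OldMask & (1u << (I * Scale)))
        NewMask |= 1u << I;
  }
  *pNewMask = NewMask;
}
// Example: a BLENDPD immediate of 0b10 (2 double lanes) widens to 0b1100 when
// the instruction is rewritten as BLENDPS/VPBLENDD over 4 float/dword lanes.
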
@@ -6583,7 +6520,7 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
if (Subtarget.hasAVX2()) {
// If we are already VPBLENDW use that, else use VPBLENDD.
if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
- table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
+ table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
}
} else {
@@ -6672,6 +6609,18 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
// We must always return true for MOVHLPSrr.
if (Opcode == X86::MOVHLPSrr)
return true;
+ break;
+ case X86::SHUFPDrri: {
+ if (Domain == 1) {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned NewImm = 0x44;
+ if (Imm & 1) NewImm |= 0x0a;
+ if (Imm & 2) NewImm |= 0xa0;
+ MI.getOperand(3).setImm(NewImm);
+ MI.setDesc(get(X86::SHUFPSrri));
+ }
+ return true;
+ }
}
return false;
}
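
For the SHUFPDrri-to-SHUFPSrri rewrite added above, a worked expansion of the immediate remap may help (a sketch only; lane numbering follows the usual SSE convention, with src1 supplying the low result half and src2 the high half).

// SHUFPD imm: bit 0 selects the low/high double of src1 for result element 0,
// bit 1 selects the low/high double of src2 for result element 1.
// SHUFPS imm: four 2-bit dword selectors. The base mask 0x44 = {0,1,0,1}
// reproduces SHUFPD imm 0 (low double of src1, low double of src2).
static unsigned shufpdImmToShufpsImm(unsigned Imm) {
  unsigned NewImm = 0x44;
  if (Imm & 1) NewImm |= 0x0a; // take dwords {2,3} of src1 instead -> 0x4e
  if (Imm & 2) NewImm |= 0xa0; // take dwords {2,3} of src2 instead
  return NewImm;
}
// Examples: SHUFPD imm 1 -> SHUFPS imm 0x4e, imm 2 -> 0xe4, imm 3 -> 0xee.
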
@@ -6691,6 +6640,8 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
validDomains = 0xe;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+ } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
+ validDomains = 0x6;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
     // Insert/extract instructions should only affect the domain if AVX2
// is enabled.
@@ -6730,6 +6681,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
"256-bit vector operations only available in AVX2");
table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
}
+ if (!table) { // try the FP table
+ table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
+ assert((!table || Domain < 3) &&
+ "Can only select PackedSingle or PackedDouble");
+ }
if (!table) { // try the other table
assert(Subtarget.hasAVX2() &&
"256-bit insert/extract only available in AVX2");
@@ -7140,6 +7096,20 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::PADDWrr:
case X86::PADDDrr:
case X86::PADDQrr:
+ case X86::PMULLWrr:
+ case X86::PMULLDrr:
+ case X86::PMAXSBrr:
+ case X86::PMAXSDrr:
+ case X86::PMAXSWrr:
+ case X86::PMAXUBrr:
+ case X86::PMAXUDrr:
+ case X86::PMAXUWrr:
+ case X86::PMINSBrr:
+ case X86::PMINSDrr:
+ case X86::PMINSWrr:
+ case X86::PMINUBrr:
+ case X86::PMINUDrr:
+ case X86::PMINUWrr:
case X86::VPANDrr:
case X86::VPANDYrr:
case X86::VPANDDZ128rr:
@@ -7243,6 +7213,78 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::VPMULLQZ128rr:
case X86::VPMULLQZ256rr:
case X86::VPMULLQZrr:
+ case X86::VPMAXSBrr:
+ case X86::VPMAXSBYrr:
+ case X86::VPMAXSBZ128rr:
+ case X86::VPMAXSBZ256rr:
+ case X86::VPMAXSBZrr:
+ case X86::VPMAXSDrr:
+ case X86::VPMAXSDYrr:
+ case X86::VPMAXSDZ128rr:
+ case X86::VPMAXSDZ256rr:
+ case X86::VPMAXSDZrr:
+ case X86::VPMAXSQZ128rr:
+ case X86::VPMAXSQZ256rr:
+ case X86::VPMAXSQZrr:
+ case X86::VPMAXSWrr:
+ case X86::VPMAXSWYrr:
+ case X86::VPMAXSWZ128rr:
+ case X86::VPMAXSWZ256rr:
+ case X86::VPMAXSWZrr:
+ case X86::VPMAXUBrr:
+ case X86::VPMAXUBYrr:
+ case X86::VPMAXUBZ128rr:
+ case X86::VPMAXUBZ256rr:
+ case X86::VPMAXUBZrr:
+ case X86::VPMAXUDrr:
+ case X86::VPMAXUDYrr:
+ case X86::VPMAXUDZ128rr:
+ case X86::VPMAXUDZ256rr:
+ case X86::VPMAXUDZrr:
+ case X86::VPMAXUQZ128rr:
+ case X86::VPMAXUQZ256rr:
+ case X86::VPMAXUQZrr:
+ case X86::VPMAXUWrr:
+ case X86::VPMAXUWYrr:
+ case X86::VPMAXUWZ128rr:
+ case X86::VPMAXUWZ256rr:
+ case X86::VPMAXUWZrr:
+ case X86::VPMINSBrr:
+ case X86::VPMINSBYrr:
+ case X86::VPMINSBZ128rr:
+ case X86::VPMINSBZ256rr:
+ case X86::VPMINSBZrr:
+ case X86::VPMINSDrr:
+ case X86::VPMINSDYrr:
+ case X86::VPMINSDZ128rr:
+ case X86::VPMINSDZ256rr:
+ case X86::VPMINSDZrr:
+ case X86::VPMINSQZ128rr:
+ case X86::VPMINSQZ256rr:
+ case X86::VPMINSQZrr:
+ case X86::VPMINSWrr:
+ case X86::VPMINSWYrr:
+ case X86::VPMINSWZ128rr:
+ case X86::VPMINSWZ256rr:
+ case X86::VPMINSWZrr:
+ case X86::VPMINUBrr:
+ case X86::VPMINUBYrr:
+ case X86::VPMINUBZ128rr:
+ case X86::VPMINUBZ256rr:
+ case X86::VPMINUBZrr:
+ case X86::VPMINUDrr:
+ case X86::VPMINUDYrr:
+ case X86::VPMINUDZ128rr:
+ case X86::VPMINUDZ256rr:
+ case X86::VPMINUDZrr:
+ case X86::VPMINUQZ128rr:
+ case X86::VPMINUQZ256rr:
+ case X86::VPMINUQZrr:
+ case X86::VPMINUWrr:
+ case X86::VPMINUWYrr:
+ case X86::VPMINUWZ128rr:
+ case X86::VPMINUWZ256rr:
+ case X86::VPMINUWZrr:
// Normal min/max instructions are not commutative because of NaN and signed
// zero semantics, but these are. Thus, there's no need to check for global
// relaxed math; the instructions themselves have the properties we need.
@@ -7698,7 +7740,7 @@ bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
// Does the function use a red zone? If it does, then we can't risk messing
// with the stack.
- if (!F.hasFnAttribute(Attribute::NoRedZone)) {
+ if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
// It could have a red zone. If it does, then we don't want to touch it.
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
if (!X86FI || X86FI->getUsesRedZone())
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 159cb50afc5c..13ca17139494 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -1,9 +1,8 @@
//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -36,62 +35,24 @@ enum AsmComments {
AC_EVEX_2_VEX = MachineInstr::TAsmComments
};
-// X86 specific condition code. These correspond to X86_*_COND in
-// X86InstrInfo.td. They must be kept in synch.
-enum CondCode {
- COND_A = 0,
- COND_AE = 1,
- COND_B = 2,
- COND_BE = 3,
- COND_E = 4,
- COND_G = 5,
- COND_GE = 6,
- COND_L = 7,
- COND_LE = 8,
- COND_NE = 9,
- COND_NO = 10,
- COND_NP = 11,
- COND_NS = 12,
- COND_O = 13,
- COND_P = 14,
- COND_S = 15,
- LAST_VALID_COND = COND_S,
-
- // Artificial condition codes. These are used by AnalyzeBranch
- // to indicate a block terminated with two conditional branches that together
- // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
- // which can't be represented on x86 with a single condition. These
- // are never used in MachineInstrs and are inverses of one another.
- COND_NE_OR_P,
- COND_E_AND_NP,
-
- COND_INVALID
-};
-
-// Turn condition code into conditional branch opcode.
-unsigned GetCondBranchFromCond(CondCode CC);
-
/// Return a pair of condition code for the given predicate and whether
 /// the instruction operands should be swapped to match the condition code.
std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
-/// Return a set opcode for the given condition and whether it has
-/// a memory operand.
-unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
+/// Return a setcc opcode based on whether it has a memory operand.
+unsigned getSETOpc(bool HasMemoryOperand = false);
-/// Return a cmov opcode for the given condition, register size in
-/// bytes, and operand type.
-unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
- bool HasMemoryOperand = false);
+/// Return a cmov opcode for the given register size in bytes, and operand type.
+unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false);
-// Turn jCC opcode into condition code.
-CondCode getCondFromBranchOpc(unsigned Opc);
+// Turn jCC instruction into condition code.
+CondCode getCondFromBranch(const MachineInstr &MI);
-// Turn setCC opcode into condition code.
-CondCode getCondFromSETOpc(unsigned Opc);
+// Turn setCC instruction into condition code.
+CondCode getCondFromSETCC(const MachineInstr &MI);
-// Turn CMov opcode into condition code.
-CondCode getCondFromCMovOpc(unsigned Opc);
+// Turn CMov instruction into condition code.
+CondCode getCondFromCMov(const MachineInstr &MI);
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
@@ -327,7 +288,8 @@ public:
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
- bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+ bool getMemOperandWithOffset(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
bool analyzeBranchPredicate(MachineBasicBlock &MBB,
@@ -388,7 +350,8 @@ public:
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
- LiveIntervals *LIS = nullptr) const override;
+ LiveIntervals *LIS = nullptr,
+ VirtRegMap *VRM = nullptr) const override;
/// foldMemoryOperand - Same as the previous version except it allows folding
/// of any load and store from / to any address, not just from a specific
@@ -453,7 +416,10 @@ public:
/// conservative. If it cannot definitely determine the safety after visiting
/// a few instructions in each direction it assumes it's not safe.
bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
+ MachineBasicBlock::iterator I) const {
+ return MBB.computeRegisterLiveness(&RI, X86::EFLAGS, I, 4) ==
+ MachineBasicBlock::LQR_Dead;
+ }
/// True if MI has a condition code def, e.g. EFLAGS, that is
/// not marked dead.
@@ -590,7 +556,8 @@ private:
MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
MachineFunction::iterator &MFI,
MachineInstr &MI,
- LiveVariables *LV) const;
+ LiveVariables *LV,
+ bool Is8BitOp) const;
/// Handles memory folding for special case instructions, for instance those
/// requiring custom manipulation of the address.
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index e53f83baa3c6..8e05dd8ec5c1 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -1,9 +1,8 @@
//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -64,6 +63,10 @@ def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+def SDTX86rdpkru : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def SDTX86wrpkru : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>;
+
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>;
def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
@@ -124,6 +127,9 @@ def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
+def SDT_X86ENQCMD : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>, SDTCisSameAs<1, 2>]>;
+
def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
[SDNPHasChain,SDNPSideEffect]>;
def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
@@ -152,6 +158,11 @@ def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand,
[SDNPHasChain, SDNPSideEffect]>;
+def X86rdpkru : SDNode<"X86ISD::RDPKRU", SDTX86rdpkru,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86wrpkru : SDNode<"X86ISD::WRPKRU", SDTX86wrpkru,
+ [SDNPHasChain, SDNPSideEffect]>;
+
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
@@ -206,13 +217,6 @@ def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad]>;
-def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-
def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
@@ -306,6 +310,11 @@ def X86tpause : SDNode<"X86ISD::TPAUSE",
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
[SDNPHasChain, SDNPSideEffect]>;
+def X86enqcmd : SDNode<"X86ISD::ENQCMD", SDT_X86ENQCMD,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86enqcmds : SDNode<"X86ISD::ENQCMDS", SDT_X86ENQCMD,
+ [SDNPHasChain, SDNPSideEffect]>;
+
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
//
@@ -371,37 +380,35 @@ def anymem : X86MemOperand<"printanymem">;
// restrict to only unsized memory.
def opaquemem : X86MemOperand<"printopaquemem">;
-def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>;
-def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>;
-def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>;
-def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>;
-def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>;
-def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>;
-def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>;
-def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>;
-def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>;
-def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>;
-def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>;
-def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>;
-def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>;
-
-def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>;
+def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>;
+def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>;
+def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
+def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
+def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
+def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
+def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
+def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>;
+def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
+def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
+def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
// Gather mem operands
-def vx64mem : X86VMemOperand<VR128, "printi64mem", X86Mem64_RC128Operand>;
-def vx128mem : X86VMemOperand<VR128, "printi128mem", X86Mem128_RC128Operand>;
-def vx256mem : X86VMemOperand<VR128, "printi256mem", X86Mem256_RC128Operand>;
-def vy128mem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256Operand>;
-def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>;
-
-def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>;
-def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>;
-def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>;
-def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>;
-def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>;
-def vy512xmem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
-def vz256mem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>;
-def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>;
+def vx64mem : X86VMemOperand<VR128, "printqwordmem", X86Mem64_RC128Operand>;
+def vx128mem : X86VMemOperand<VR128, "printxmmwordmem", X86Mem128_RC128Operand>;
+def vx256mem : X86VMemOperand<VR128, "printymmwordmem", X86Mem256_RC128Operand>;
+def vy128mem : X86VMemOperand<VR256, "printxmmwordmem", X86Mem128_RC256Operand>;
+def vy256mem : X86VMemOperand<VR256, "printymmwordmem", X86Mem256_RC256Operand>;
+
+def vx64xmem : X86VMemOperand<VR128X, "printqwordmem", X86Mem64_RC128XOperand>;
+def vx128xmem : X86VMemOperand<VR128X, "printxmmwordmem", X86Mem128_RC128XOperand>;
+def vx256xmem : X86VMemOperand<VR128X, "printymmwordmem", X86Mem256_RC128XOperand>;
+def vy128xmem : X86VMemOperand<VR256X, "printxmmwordmem", X86Mem128_RC256XOperand>;
+def vy256xmem : X86VMemOperand<VR256X, "printymmwordmem", X86Mem256_RC256XOperand>;
+def vy512xmem : X86VMemOperand<VR256X, "printzmmwordmem", X86Mem512_RC256XOperand>;
+def vz256mem : X86VMemOperand<VR512, "printymmwordmem", X86Mem256_RC512Operand>;
+def vz512mem : X86VMemOperand<VR512, "printzmmwordmem", X86Mem512_RC512Operand>;
// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
// of a plain GPR, so that it doesn't potentially require a REX prefix.
@@ -409,7 +416,7 @@ def ptr_rc_norex : PointerLikeRegClass<2>;
def ptr_rc_norex_nosp : PointerLikeRegClass<3>;
def i8mem_NOREX : Operand<iPTR> {
- let PrintMethod = "printi8mem";
+ let PrintMethod = "printbytemem";
let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm,
SEGMENT_REG);
let ParserMatchClass = X86Mem8AsmOperand;
@@ -424,7 +431,7 @@ def ptr_rc_tailcall : PointerLikeRegClass<4>;
// allowed to use callee-saved registers since they must be scheduled
// after callee-saved register are popped.
def i32mem_TC : Operand<i32> {
- let PrintMethod = "printi32mem";
+ let PrintMethod = "printdwordmem";
let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
i32imm, SEGMENT_REG);
let ParserMatchClass = X86Mem32AsmOperand;
@@ -435,7 +442,7 @@ def i32mem_TC : Operand<i32> {
// allowed to use callee-saved registers since they must be scheduled
 // after callee-saved registers are popped.
def i64mem_TC : Operand<i64> {
- let PrintMethod = "printi64mem";
+ let PrintMethod = "printqwordmem";
let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
ptr_rc_tailcall, i32imm, SEGMENT_REG);
let ParserMatchClass = X86Mem64AsmOperand;
@@ -603,24 +610,10 @@ def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32",
def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64",
X86MemOffs64_64AsmOperand>;
-def SSECC : Operand<i8> {
- let PrintMethod = "printSSEAVXCC";
- let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def AVXCC : Operand<i8> {
- let PrintMethod = "printSSEAVXCC";
- let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def AVX512ICC : Operand<i8> {
- let PrintMethod = "printSSEAVXCC";
- let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def XOPCC : Operand<i8> {
- let PrintMethod = "printXOPCC";
- let OperandType = "OPERAND_IMMEDIATE";
+def ccode : Operand<i8> {
+ let PrintMethod = "printCondCode";
+ let OperandNamespace = "X86";
+ let OperandType = "OPERAND_COND_CODE";
}
class ImmSExtAsmOperandClass : AsmOperandClass {
@@ -640,7 +633,8 @@ def AVX512RCOperand : AsmOperandClass {
}
def AVX512RC : Operand<i32> {
let PrintMethod = "printRoundingControl";
- let OperandType = "OPERAND_IMMEDIATE";
+ let OperandNamespace = "X86";
+ let OperandType = "OPERAND_ROUNDING_CONTROL";
let ParserMatchClass = AVX512RCOperand;
}
@@ -718,6 +712,14 @@ def u8imm : Operand<i8> {
let OperandType = "OPERAND_IMMEDIATE";
}
+// 16-bit immediate but only 8 bits are significant and they are unsigned.
+// Used by BT instructions.
+def i16u8imm : Operand<i16> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
 // 32-bit immediate but only 8 bits are significant and they are unsigned.
// Used by some SSE/AVX instructions that use intrinsics.
def i32u8imm : Operand<i32> {
@@ -726,6 +728,14 @@ def i32u8imm : Operand<i32> {
let OperandType = "OPERAND_IMMEDIATE";
}
+// 64-bit immediate but only 8 bits are significant and they are unsigned.
+// Used by BT instructions.
+def i64u8imm : Operand<i64> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
// 64-bits but only 32 bits are significant, and those bits are treated as being
// pc relative.
def i64i32imm_pcrel : Operand<i64> {
@@ -747,6 +757,33 @@ def lea64mem : Operand<i64> {
let ParserMatchClass = X86MemAsmOperand;
}
+let RenderMethod = "addMaskPairOperands" in {
+ def VK1PairAsmOperand : AsmOperandClass { let Name = "VK1Pair"; }
+ def VK2PairAsmOperand : AsmOperandClass { let Name = "VK2Pair"; }
+ def VK4PairAsmOperand : AsmOperandClass { let Name = "VK4Pair"; }
+ def VK8PairAsmOperand : AsmOperandClass { let Name = "VK8Pair"; }
+ def VK16PairAsmOperand : AsmOperandClass { let Name = "VK16Pair"; }
+}
+
+def VK1Pair : RegisterOperand<VK1PAIR, "printVKPair"> {
+ let ParserMatchClass = VK1PairAsmOperand;
+}
+
+def VK2Pair : RegisterOperand<VK2PAIR, "printVKPair"> {
+ let ParserMatchClass = VK2PairAsmOperand;
+}
+
+def VK4Pair : RegisterOperand<VK4PAIR, "printVKPair"> {
+ let ParserMatchClass = VK4PairAsmOperand;
+}
+
+def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> {
+ let ParserMatchClass = VK8PairAsmOperand;
+}
+
+def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> {
+ let ParserMatchClass = VK16PairAsmOperand;
+}
//===----------------------------------------------------------------------===//
// X86 Complex Pattern Definitions.
@@ -833,6 +870,8 @@ def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
def PKU : Predicate<"Subtarget->hasPKU()">;
def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
+def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
+def HasBF16 : Predicate<"Subtarget->hasBF16()">;
def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
@@ -894,8 +933,10 @@ def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">;
def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">;
+def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
+def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
@@ -928,12 +969,12 @@ def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
// the Function object through the <Target>Subtarget and objections were raised
// to that (see post-commit review comments for r301750).
let RecomputePerFunction = 1 in {
- def OptForSize : Predicate<"MF->getFunction().optForSize()">;
- def OptForMinSize : Predicate<"MF->getFunction().optForMinSize()">;
- def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">;
+ def OptForSize : Predicate<"MF->getFunction().hasOptSize()">;
+ def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">;
+ def OptForSpeed : Predicate<"!MF->getFunction().hasOptSize()">;
def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
- "MF->getFunction().optForSize()">;
- def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().optForSize() || "
+ "MF->getFunction().hasOptSize()">;
+ def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().hasOptSize() || "
"!Subtarget->hasSSE41()">;
}
@@ -959,22 +1000,22 @@ include "X86InstrFormats.td"
// X86 specific condition code. These correspond to CondCode in
// X86InstrInfo.h. They must be kept in synch.
-def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
-def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
+def X86_COND_O : PatLeaf<(i8 0)>;
+def X86_COND_NO : PatLeaf<(i8 1)>;
def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
-def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
+def X86_COND_AE : PatLeaf<(i8 3)>; // alt. COND_NC
def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
-def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
-def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
-def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
-def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
-def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
-def X86_COND_NO : PatLeaf<(i8 10)>;
+def X86_COND_NE : PatLeaf<(i8 5)>; // alt. COND_NZ
+def X86_COND_BE : PatLeaf<(i8 6)>; // alt. COND_NA
+def X86_COND_A : PatLeaf<(i8 7)>; // alt. COND_NBE
+def X86_COND_S : PatLeaf<(i8 8)>;
+def X86_COND_NS : PatLeaf<(i8 9)>;
+def X86_COND_P : PatLeaf<(i8 10)>; // alt. COND_PE
def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
-def X86_COND_NS : PatLeaf<(i8 12)>;
-def X86_COND_O : PatLeaf<(i8 13)>;
-def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
-def X86_COND_S : PatLeaf<(i8 15)>;
+def X86_COND_L : PatLeaf<(i8 12)>; // alt. COND_NGE
+def X86_COND_GE : PatLeaf<(i8 13)>; // alt. COND_NL
+def X86_COND_LE : PatLeaf<(i8 14)>; // alt. COND_NG
+def X86_COND_G : PatLeaf<(i8 15)>; // alt. COND_NLE
def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
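
The X86_COND_* renumbering a few lines up lines the TableGen leaf values up with the hardware condition-code encoding, so (hedging slightly, since the actual opcode emission lives elsewhere in the backend) the value can be dropped straight into the short-form Jcc/SETcc/CMOVcc encodings. A quick sanity check against the standard one-byte Jcc opcodes:

#include <cstdint>
// With the new numbering, the short-form Jcc opcode is simply 0x70 + cc:
//   COND_B  = 2  -> JB  = 0x72
//   COND_E  = 4  -> JE  = 0x74
//   COND_NE = 5  -> JNE = 0x75
//   COND_S  = 8  -> JS  = 0x78
//   COND_G  = 15 -> JG  = 0x7F
static uint8_t shortJccOpcode(unsigned CC) { return uint8_t(0x70 + CC); }
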
@@ -1007,16 +1048,13 @@ def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
// Eventually, it would be nice to allow ConstantHoisting to merge constants
// globally for potentially added savings.
//
-def imm8_su : PatLeaf<(i8 relocImm), [{
+def relocImm8_su : PatLeaf<(i8 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def imm16_su : PatLeaf<(i16 relocImm), [{
+def relocImm16_su : PatLeaf<(i16 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def imm32_su : PatLeaf<(i32 relocImm), [{
- return !shouldAvoidImmediateInstFormsForSize(N);
-}]>;
-def i64immSExt32_su : PatLeaf<(i64immSExt32), [{
+def relocImm32_su : PatLeaf<(i32 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
@@ -1121,7 +1159,19 @@ def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
-def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+// We can treat an i8/i16 extending load to i64 as a 32-bit load if it's known
+// to be 4-byte aligned or better.
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType != ISD::EXTLOAD)
+ return false;
+ if (LD->getMemoryVT() == MVT::i32)
+ return true;
+
+ return LD->getAlignment() >= 4 && !LD->isVolatile();
+}]>;
// An 'and' node with a single use.
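
Restated outside of TableGen, the predicate attached to extloadi64i32 above boils down to the following check (a paraphrase using the same LoadSDNode accessors, not new upstream code):

// Accept a genuine i32 extending load, or a narrower extending load that is
// 4-byte aligned and non-volatile: a 4-byte access at a 4-byte-aligned
// address stays inside one aligned granule, so widening the load cannot
// fault, and the extra bits feed an any-extend where their value is ignored.
static bool isExtLoadUsableAsI32Load(const LoadSDNode *LD) {
  if (LD->getExtensionType() != ISD::EXTLOAD)
    return false;
  if (LD->getMemoryVT() == MVT::i32)
    return true;
  return LD->getAlignment() >= 4 && !LD->isVolatile();
}
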
@@ -1517,16 +1567,16 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
let SchedRW = [WriteStore] in {
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store (i8 imm8_su:$src), addr:$dst)]>;
+ [(store (i8 relocImm8_su:$src), addr:$dst)]>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store (i16 imm16_su:$src), addr:$dst)]>, OpSize16;
+ [(store (i16 relocImm16_su:$src), addr:$dst)]>, OpSize16;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store (i32 imm32_su:$src), addr:$dst)]>, OpSize32;
+ [(store (i32 relocImm32_su:$src), addr:$dst)]>, OpSize32;
def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64immSExt32_su:$src, addr:$dst)]>,
+ [(store i64relocImmSExt32_su:$src, addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
@@ -1773,36 +1823,36 @@ let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteBitTestRegLd] in {
}
let SchedRW = [WriteBitTest] in {
-def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16u8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>,
+ [(set EFLAGS, (X86bt GR16:$src1, imm:$src2))]>,
OpSize16, TB;
-def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32u8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>,
+ [(set EFLAGS, (X86bt GR32:$src1, imm:$src2))]>,
OpSize32, TB;
-def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64u8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB;
+ [(set EFLAGS, (X86bt GR64:$src1, imm:$src2))]>, TB;
} // SchedRW
// Note that these instructions aren't slow because that only applies when the
// other operand is in a register. When it's an immediate, bt is still fast.
let SchedRW = [WriteBitTestImmLd] in {
-def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi16 addr:$src1),
- i16immSExt8:$src2))]>,
+ imm:$src2))]>,
OpSize16, TB;
-def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi32 addr:$src1),
- i32immSExt8:$src2))]>,
+ imm:$src2))]>,
OpSize32, TB;
-def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
- i64immSExt8:$src2))]>, TB,
+ imm:$src2))]>, TB,
Requires<[In64BitMode]>;
} // SchedRW
@@ -1832,20 +1882,20 @@ def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
-def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
-def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
-def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
-def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
-def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
-def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
@@ -1875,24 +1925,24 @@ def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
-def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
-def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB;
-def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
-def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
-def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB;
-def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
@@ -1922,20 +1972,20 @@ def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
-def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
-def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
-def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
-def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
-def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
-def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
@@ -2090,12 +2140,13 @@ def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
- "cmpxchg8b\t$dst", []>, TB;
+ "cmpxchg8b\t$dst", []>, TB, Requires<[HasCmpxchg8b]>;
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+// NOTE: In64BitMode check needed for the AssemblerPredicate.
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
"cmpxchg16b\t$dst", []>,
- TB, Requires<[HasCmpxchg16b, In64BitMode]>;
+ TB, Requires<[HasCmpxchg16b,In64BitMode]>;
} // SchedRW, mayLoad, mayStore, hasSideEffects
@@ -2388,6 +2439,11 @@ def xor_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
return hasNoCarryFlagUses(SDValue(N, 1));
}]>;
+def and_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86and_flag node:$lhs, node:$rhs), [{
+ return hasNoCarryFlagUses(SDValue(N, 1));
+}]>;
+
let Predicates = [HasBMI] in {
// FIXME: patterns for the load versions are not implemented
def : Pat<(and GR32:$src, (add GR32:$src, -1)),
@@ -2406,12 +2462,20 @@ let Predicates = [HasBMI] in {
(BLSI64rr GR64:$src)>;
// Versions to match flag producing ops.
- // X86and_flag nodes are rarely created. Those should use CMP+AND. We do
- // TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed.
+ def : Pat<(and_flag_nocf GR32:$src, (add GR32:$src, -1)),
+ (BLSR32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf GR64:$src, (add GR64:$src, -1)),
+ (BLSR64rr GR64:$src)>;
+
def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, -1)),
(BLSMSK32rr GR32:$src)>;
def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, -1)),
(BLSMSK64rr GR64:$src)>;
+
+ def : Pat<(and_flag_nocf GR32:$src, (ineg GR32:$src)),
+ (BLSI32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf GR64:$src, (ineg GR64:$src)),
+ (BLSI64rr GR64:$src)>;
}
multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC,
@@ -2653,16 +2717,12 @@ defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W;
// MONITORX/MWAITX Instructions
//
let SchedRW = [ WriteSystem ] in {
- let usesCustomInserter = 1 in {
- def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
- [(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>,
- Requires<[ HasMWAITX ]>;
- }
-
- let Uses = [ EAX, ECX, EDX ] in {
- def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
- TB, Requires<[ HasMWAITX ]>;
- }
+ let Uses = [ EAX, ECX, EDX ] in
+ def MONITORX32rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
+ TB, Requires<[ HasMWAITX, Not64BitMode ]>;
+ let Uses = [ RAX, ECX, EDX ] in
+ def MONITORX64rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
+ TB, Requires<[ HasMWAITX, In64BitMode ]>;
let Uses = [ ECX, EAX, EBX ] in {
def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
@@ -2676,9 +2736,9 @@ def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>,
def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>,
Requires<[ In64BitMode ]>;
-def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>,
+def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORX32rrr)>,
Requires<[ Not64BitMode ]>;
-def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
+def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORX64rrr)>,
Requires<[ In64BitMode ]>;
//===----------------------------------------------------------------------===//
@@ -2738,21 +2798,50 @@ def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
} // SchedRW
//===----------------------------------------------------------------------===//
+// ENQCMD/S - Enqueue 64-byte command as user with 64-byte write atomicity
+//
+let SchedRW = [WriteStore], Defs = [EFLAGS] in {
+ def ENQCMD16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
+ "enqcmd\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmd GR16:$dst, addr:$src))]>,
+ T8XD, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
+ def ENQCMD32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
+ "enqcmd\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmd GR32:$dst, addr:$src))]>,
+ T8XD, AdSize32, Requires<[HasENQCMD]>;
+ def ENQCMD64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
+ "enqcmd\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmd GR64:$dst, addr:$src))]>,
+ T8XD, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
+
+ def ENQCMDS16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
+ "enqcmds\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmds GR16:$dst, addr:$src))]>,
+ T8XS, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
+ def ENQCMDS32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
+ "enqcmds\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmds GR32:$dst, addr:$src))]>,
+ T8XS, AdSize32, Requires<[HasENQCMD]>;
+ def ENQCMDS64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
+ "enqcmds\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmds GR64:$dst, addr:$src))]>,
+ T8XS, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
// CLZERO Instruction
//
let SchedRW = [WriteSystem] in {
let Uses = [EAX] in
- def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
- TB, Requires<[HasCLZERO]>;
-
- let usesCustomInserter = 1 in {
- def CLZERO : PseudoI<(outs), (ins i32mem:$src1),
- [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>;
- }
+ def CLZERO32r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
+ TB, Requires<[HasCLZERO, Not64BitMode]>;
+ let Uses = [RAX] in
+ def CLZERO64r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
+ TB, Requires<[HasCLZERO, In64BitMode]>;
} // SchedRW
-def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>;
-def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>;
+def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>;
+def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
// Pattern fragments to auto generate TBM instructions.
@@ -2812,8 +2901,6 @@ let Predicates = [HasTBM] in {
(TZMSK64rr GR64:$src)>;
// Patterns to match flag producing ops.
- // X86and_flag nodes are rarely created. Those should use CMP+AND. We do
- // TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed.
def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))),
(BLCI32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))),
@@ -2825,6 +2912,11 @@ let Predicates = [HasTBM] in {
def : Pat<(or_flag_nocf GR64:$src, (sub -2, GR64:$src)),
(BLCI64rr GR64:$src)>;
+ def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, 1)),
+ (BLCIC32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, 1)),
+ (BLCIC64rr GR64:$src)>;
+
def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, 1)),
(BLCMSK32rr GR32:$src)>;
def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, 1)),
@@ -2849,6 +2941,11 @@ let Predicates = [HasTBM] in {
(T1MSKC32rr GR32:$src)>;
def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, 1)),
(T1MSKC64rr GR64:$src)>;
+
+ def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, -1)),
+ (TZMSK32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, -1)),
+ (TZMSK64rr GR64:$src)>;
} // HasTBM
//===----------------------------------------------------------------------===//
@@ -3231,39 +3328,39 @@ def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>;
// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
// gas.
multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
- def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"),
- (Inst RST:$op), EmitAlias>;
- def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"),
+ def : InstAlias<!strconcat(Mnemonic, "\t$op"),
+ (Inst RSTi:$op), EmitAlias>;
+ def : InstAlias<!strconcat(Mnemonic, "\t{%st, %st|st, st}"),
(Inst ST0), EmitAlias>;
}
-defm : FpUnaryAlias<"fadd", ADD_FST0r>;
+defm : FpUnaryAlias<"fadd", ADD_FST0r, 0>;
defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>;
-defm : FpUnaryAlias<"fsub", SUB_FST0r>;
-defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>;
-defm : FpUnaryAlias<"fsubr", SUBR_FST0r>;
-defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>;
-defm : FpUnaryAlias<"fmul", MUL_FST0r>;
-defm : FpUnaryAlias<"fmulp", MUL_FPrST0>;
-defm : FpUnaryAlias<"fdiv", DIV_FST0r>;
-defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>;
-defm : FpUnaryAlias<"fdivr", DIVR_FST0r>;
-defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>;
+defm : FpUnaryAlias<"fsub", SUB_FST0r, 0>;
+defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0, 0>;
+defm : FpUnaryAlias<"fsubr", SUBR_FST0r, 0>;
+defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0, 0>;
+defm : FpUnaryAlias<"fmul", MUL_FST0r, 0>;
+defm : FpUnaryAlias<"fmulp", MUL_FPrST0, 0>;
+defm : FpUnaryAlias<"fdiv", DIV_FST0r, 0>;
+defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0, 0>;
+defm : FpUnaryAlias<"fdivr", DIVR_FST0r, 0>;
+defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0, 0>;
defm : FpUnaryAlias<"fcomi", COM_FIr, 0>;
defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>;
-defm : FpUnaryAlias<"fcompi", COM_FIPr>;
-defm : FpUnaryAlias<"fucompi", UCOM_FIPr>;
+defm : FpUnaryAlias<"fcompi", COM_FIPr, 0>;
+defm : FpUnaryAlias<"fucompi", UCOM_FIPr, 0>;
-// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
+// Handle "f{mulp,addp} $op, %st(0)" the same as "f{mulp,addp} $op", since they
// commute. We also allow fdiv[r]p/fsubrp even though they don't commute,
// solely because gas supports it.
-def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>;
-def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>;
-def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>;
-def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
-def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
-def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
+def : InstAlias<"faddp\t{$op, %st|st, $op}", (ADD_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fmulp\t{$op, %st|st, $op}", (MUL_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fsub{|r}p\t{$op, %st|st, $op}", (SUBR_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fsub{r|}p\t{$op, %st|st, $op}", (SUB_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fdiv{|r}p\t{$op, %st|st, $op}", (DIVR_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fdiv{r|}p\t{$op, %st|st, $op}", (DIV_FPrST0 RSTi:$op), 0>;
def : InstAlias<"fnstsw" , (FNSTSW16r), 0>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 8f3357170576..57835b1a256a 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -1,9 +1,8 @@
//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -153,7 +152,9 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
// MMX EMMS Instruction
//===----------------------------------------------------------------------===//
-let SchedRW = [WriteEMMS] in
+let SchedRW = [WriteEMMS],
+ Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in
def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
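// Note on the Defs list above: EMMS empties the x87 tag word so the registers
// can be reused by x87 code, so modeling every MM and ST register as a def
// keeps MMX/x87 values from being treated as live across it.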
//===----------------------------------------------------------------------===//
@@ -544,7 +545,7 @@ let Predicates = [HasMMX, HasSSE1] in {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
(outs VR64:$dst),
diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td
index c1a8cc7c5fbf..f7d931510fe2 100644
--- a/lib/Target/X86/X86InstrMPX.td
+++ b/lib/Target/X86/X86InstrMPX.td
@@ -1,9 +1,8 @@
//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrSGX.td b/lib/Target/X86/X86InstrSGX.td
index 488cc4438076..747f5aa86653 100644
--- a/lib/Target/X86/X86InstrSGX.td
+++ b/lib/Target/X86/X86InstrSGX.td
@@ -1,9 +1,8 @@
//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index e2bcd18ce660..7d0a5b87baf4 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1,9 +1,8 @@
//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -22,6 +21,7 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, X86MemOperand x86memop,
Domain d, X86FoldableSchedWrite sched,
bit Is2Addr = 1> {
+let isCodeGenOnly = 1 in {
let isCommutable = 1 in {
def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
@@ -37,6 +37,7 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+}
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
@@ -44,7 +45,7 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
ValueType VT, string asm, Operand memopr,
ComplexPattern mem_cpat, Domain d,
X86FoldableSchedWrite sched, bit Is2Addr = 1> {
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let hasSideEffects = 0 in {
def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
@@ -224,16 +225,29 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
}
// Loads from memory that automatically zero the upper bits.
-multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
- PatFrag mem_pat, string OpcodeStr, Domain d> {
- def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
+ PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
+ Domain d> {
+ def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))], d>,
+ [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
- def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))], d>,
+ [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
Sched<[WriteFLoad]>;
+
+ // _alt version uses FR32/FR64 register class.
+ let isCodeGenOnly = 1 in {
+ def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))], d>,
+ VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
+ def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))], d>,
+ Sched<[WriteFLoad]>;
+ }
}
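// Roughly: the primary rm defs above now produce a full VR128 through a
// vzload fragment (modeling that MOVSS/MOVSD loads zero the upper elements),
// while the isCodeGenOnly "_alt" defs keep the FR32/FR64 scalar form for
// plain scalar loads.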
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
@@ -242,49 +256,25 @@ defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
SSEPackedDouble, "MOVSD", UseSSE2>, XD;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
- defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
+ defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
SSEPackedSingle>, XS;
- defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
+ defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
SSEPackedDouble>, XD;
}
// Patterns
let Predicates = [UseAVX] in {
- // MOVSSrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
-
- // MOVSDrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (VMOVSSrm addr:$src)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (VMOVSDrm addr:$src)>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
- def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
- def : Pat<(v8f32 (X86vzload addr:$src)),
+ def : Pat<(v8f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+ def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
-
- // Extract and store.
- def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
- addr:$dst),
- (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
}
let Predicates = [UseAVX, OptForSize] in {
@@ -304,59 +294,24 @@ let Predicates = [UseAVX, OptForSize] in {
(SUBREG_TO_REG (i32 0),
(v4i32 (VMOVSSrr (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
-
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
- (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
- (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
- sub_xmm)>;
}
-let Predicates = [UseSSE1] in {
- let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
- // Move scalar to XMM zero-extended, zeroing a VR128 then do a
- // MOVSS to the lower bits.
- def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
- }
-
- // MOVSSrm already zeros the high parts of the register.
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
-
- // Extract and store.
- def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
- addr:$dst),
- (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
+let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
+// Move a scalar into an XMM register zero-extended: zero a VR128, then do a
+// MOVSS into the lower bits.
+def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}
-let Predicates = [UseSSE2] in {
- // MOVSDrm already zeros the high parts of the register.
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
-}
-
-// Aliases to help the assembler pick two byte VEX encodings by swapping the
-// operands relative to the normal instructions to use VEX.R instead of VEX.B.
-def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
-def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
+let Predicates = [UseSSE2] in
+def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (MOVSDrm addr:$src)>;
+
+let Predicates = [UseSSE1] in
+def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (MOVSSrm addr:$src)>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
@@ -504,25 +459,6 @@ let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
} // SchedRW
} // Predicate
-// Aliases to help the assembler pick two byte VEX encodings by swapping the
-// operands relative to the normal instructions to use VEX.R instead of VEX.B.
-def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
- (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
- (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
- (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
- (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
- (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
-def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
- (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
-def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
- (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
-def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
- (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
-
// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
(VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
@@ -700,10 +636,10 @@ defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
+let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)]>,
+ []>,
VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
@@ -711,10 +647,10 @@ def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
(iPTR 0))), addr:$dst)]>,
VEX, VEX_WIG;
}// UseAVX
+let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)]>;
+ []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128:$src),
@@ -722,16 +658,19 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
} // SchedRW
let Predicates = [UseSSE1] in {
- // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
- def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
- (iPTR 0))), addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
-
  // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load; we're only loading 64 bits.
- def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)),
+ def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
+ (i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(v4f32 (X86vzload64 addr:$src)),
+ (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
+ def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
+ (MOVLPSmr addr:$dst, VR128:$src)>;
}
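// A rough reading of the UseSSE1 block above: X86vzload64 loads select MOVLPS
// into a zeroed register, and an X86vextractstore64 of the low half selects
// the MOVLPSmr store, now that the store def itself carries no pattern.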
//===----------------------------------------------------------------------===//
@@ -744,24 +683,20 @@ let SchedRW = [WriteFStore] in {
// v2f64 extract of element 1 is always custom lowered to unpack high to low
// and extract element 0, so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
+let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt
- (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
- (bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
+ []>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
(iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
+let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt
- (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
- (bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)]>;
+ []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
@@ -775,19 +710,31 @@ let Predicates = [UseAVX] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDmr addr:$dst, VR128:$src)>;
+
+ // MOVLPD patterns
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
}
let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load; we're only loading 64 bits.
- def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)),
+ def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
+ addr:$dst),
+ (MOVHPSmr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE2] in {
@@ -798,11 +745,24 @@ let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
(v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(MOVHPDmr addr:$dst, VR128:$src)>;
+
+ // MOVLPD patterns
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
+ // Use MOVLPD to load into the low bits from a full vector unless we can use
+ // BLENDPD.
+ def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
}
//===----------------------------------------------------------------------===//
@@ -847,13 +807,16 @@ let Constraints = "$src1 = $dst" in {
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
- string asm, X86FoldableSchedWrite sched> {
- def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
- Sched<[sched]>;
- def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
- Sched<[sched.Folded]>;
+ string asm, string mem, X86FoldableSchedWrite sched,
+ SchedRead Int2Fpu = ReadDefault> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
+ Sched<[sched, Int2Fpu]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ mem#"\t{$src, $dst|$dst, $src}",
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
+ Sched<[sched.Folded]>;
}
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
@@ -872,74 +835,55 @@ let hasSideEffects = 0 in {
}
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- X86MemOperand x86memop, string asm,
+ X86MemOperand x86memop, string asm, string mem,
X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- Sched<[sched]>;
+ Sched<[sched, ReadDefault, ReadInt2Fpu]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}
-let Predicates = [UseAVX] in {
+let isCodeGenOnly = 1, Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}",
+ "cvttss2si", "cvttss2si",
WriteCvtSS2I>,
XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}",
+ "cvttss2si", "cvttss2si",
WriteCvtSS2I>,
XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}",
+ "cvttsd2si", "cvttsd2si",
WriteCvtSD2I>,
XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}",
+ "cvttsd2si", "cvttsd2si",
WriteCvtSD2I>,
XD, VEX, VEX_W, VEX_LIG;
-
-def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
-def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
-def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
-def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
-def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
-def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
-def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
-def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
}
+
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only memory operands are used, so
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate.
-defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
+let isCodeGenOnly = 1 in {
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
-defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
+defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
-defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
+defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
-defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
+defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
+} // isCodeGenOnly = 1
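+// e.g. (AT&T) "vcvtsi2ssl (%rdi), %xmm1, %xmm0" converts a 32-bit integer
+// while "vcvtsi2ssq (%rdi), %xmm1, %xmm0" converts a 64-bit one; with a
+// register source the suffix is unnecessary, since %eax vs. %rax already
+// fixes the width.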
let Predicates = [UseAVX] in {
- def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
- def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
-
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -959,52 +903,32 @@ let Predicates = [UseAVX] in {
(VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
+let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}",
+ "cvttss2si", "cvttss2si",
WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}",
+ "cvttss2si", "cvttss2si",
WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}",
+ "cvttsd2si", "cvttsd2si",
WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}",
+ "cvttsd2si", "cvttsd2si",
WriteCvtSD2I>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
- "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
- WriteCvtI2SS>, XS;
+ "cvtsi2ss", "cvtsi2ss{l}",
+ WriteCvtI2SS, ReadInt2Fpu>, XS;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
- "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
- WriteCvtI2SS>, XS, REX_W;
+ "cvtsi2ss", "cvtsi2ss{q}",
+ WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
- "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
- WriteCvtI2SD>, XD;
+ "cvtsi2sd", "cvtsi2sd{l}",
+ WriteCvtI2SD, ReadInt2Fpu>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
- "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
- WriteCvtI2SD>, XD, REX_W;
-
-def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
-def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
-def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
-def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
-
-def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
- (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">;
-def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
- (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">;
+ "cvtsi2sd", "cvtsi2sd{q}",
+ WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W;
+} // isCodeGenOnly = 1
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
@@ -1025,20 +949,20 @@ multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
RegisterClass DstRC, X86MemOperand x86memop,
- string asm, X86FoldableSchedWrite sched,
+ string asm, string mem, X86FoldableSchedWrite sched,
bit Is2Addr = 1> {
let hasSideEffects = 0 in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- []>, Sched<[sched]>;
+ []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
let mayLoad = 1 in
def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src2),
!if(Is2Addr,
- !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
+ asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -1057,48 +981,73 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
-let isCodeGenOnly = 1 in {
- let Predicates = [UseAVX] in {
- defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V;
- defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W;
- defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V;
- defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
- }
- let Constraints = "$src1 = $dst" in {
- defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
- defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
- defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
- defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
- }
-} // isCodeGenOnly = 1
+let Predicates = [UseAVX] in {
+defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG;
+defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W;
+defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG;
+defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W;
+}
+let Constraints = "$src1 = $dst" in {
+ defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS;
+ defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W;
+ defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD;
+ defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W;
+}
+
+def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
+
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
+
+def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
+ (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
+ (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+ (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+ (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
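+// Note: the unsuffixed memory spellings above ("cvtsi2ss (%rdi), %xmm0" and
+// the vcvtsi2ss/vcvtsi2sd equivalents) resolve to the 32-bit "l" forms; a
+// 64-bit memory source still needs an explicit "q" suffix.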
/// SSE 1 Only
// Aliases for intrinsics
-let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
ssmem, sse_load_f32, "cvttss2si",
- WriteCvtSS2I>, XS, VEX;
+ WriteCvtSS2I>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
X86cvtts2Int, ssmem, sse_load_f32,
"cvttss2si", WriteCvtSS2I>,
- XS, VEX, VEX_W;
+ XS, VEX, VEX_LIG, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
sdmem, sse_load_f64, "cvttsd2si",
- WriteCvtSS2I>, XD, VEX;
+ WriteCvtSS2I>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
X86cvtts2Int, sdmem, sse_load_f64,
"cvttsd2si", WriteCvtSS2I>,
- XD, VEX, VEX_W;
+ XD, VEX, VEX_LIG, VEX_W;
}
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
ssmem, sse_load_f32, "cvttss2si",
@@ -1112,7 +1061,40 @@ defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
X86cvtts2Int, sdmem, sse_load_f64,
"cvttsd2si", WriteCvtSD2I>, XD, REX_W;
-} // isCodeGenOnly = 1
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
+
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
let Predicates = [UseAVX] in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
@@ -1143,7 +1125,7 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
SSEPackedSingle, WriteCvtI2PS>,
PS, Requires<[UseSSE2]>;
-let Predicates = [UseAVX] in {
+// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
(VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1160,8 +1142,8 @@ def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
-}
+// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
(CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1182,7 +1164,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
/// SSE 2 Only
// Convert scalar double to scalar single
-let hasSideEffects = 0, Predicates = [UseAVX] in {
+let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -1200,6 +1182,7 @@ def : Pat<(f32 (fpround FR64:$src)),
(VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
Requires<[UseAVX]>;
+let isCodeGenOnly = 1 in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (fpround FR64:$src))]>,
@@ -1209,42 +1192,41 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
[(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
XD, Requires<[UseSSE2, OptForSize]>,
Sched<[WriteCvtSD2SS.Folded]>;
+}
-let isCodeGenOnly = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
- XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
- VR128:$src1, sse_load_f64:$src2))]>,
- XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ [(set VR128:$dst,
+ (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
+ (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
- VR128:$src1, sse_load_f64:$src2))]>,
+ [(set VR128:$dst,
+ (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>,
XD, Requires<[UseSSE2]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
-} // isCodeGenOnly = 1
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
-let hasSideEffects = 0 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -1257,51 +1239,36 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
XS, VEX_4V, VEX_LIG, VEX_WIG,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
Requires<[UseAVX, OptForSize]>;
-}
+} // isCodeGenOnly = 1, hasSideEffects = 0
def : Pat<(f64 (fpextend FR32:$src)),
(VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
-def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[UseAVX, OptForSize]>;
-def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
- Requires<[UseAVX, OptForSpeed]>;
-
+let isCodeGenOnly = 1 in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (fpextend FR32:$src))]>,
XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (extloadf32 addr:$src))]>,
+ [(set FR64:$dst, (fpextend (loadf32 addr:$src)))]>,
XS, Requires<[UseSSE2, OptForSize]>,
Sched<[WriteCvtSS2SD.Folded]>;
+} // isCodeGenOnly = 1
-// extload f32 -> f64. This matches load+fpextend because we have a hack in
-// the isel (PreprocessForFPConvert) that can introduce loads after dag
-// combine.
-// Since these loads aren't folded into the fpextend, we have to match it
-// explicitly here.
-def : Pat<(fpextend (loadf32 addr:$src)),
- (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>;
-def : Pat<(extloadf32 addr:$src),
- (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
-
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let hasSideEffects = 0 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, VEX_4V, VEX_WIG,
+ []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
@@ -1316,7 +1283,7 @@ def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
[]>, XS, Requires<[UseSSE2]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
-} // isCodeGenOnly = 1
+} // hasSideEffects = 0
// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
@@ -1476,15 +1443,11 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
// XMM only
-def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
Sched<[WriteCvtPD2ILd]>, VEX_WIG;
-def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
@@ -1497,12 +1460,13 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
-def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
-def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
}
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
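+// With a register source the xmm/ymm operand already selects the 128- or
+// 256-bit form, so these aliases just let AT&T code spell it explicitly,
+// e.g. "vcvtpd2dqx %xmm1, %xmm0" or "vcvtpd2dqy %ymm1, %xmm0"; only the
+// memory forms genuinely need the x/y suffix to pick an operand size.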
+
def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1540,17 +1504,6 @@ def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src)
Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
- (VCVTTPS2DQrr VR128:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
- (VCVTTPS2DQrm addr:$src)>;
- def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
- (VCVTTPS2DQYrr VR256:$src)>;
- def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
- (VCVTTPS2DQYrm addr:$src)>;
-}
-
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1562,39 +1515,23 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
(v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
Sched<[WriteCvtPS2ILd]>;
-let Predicates = [UseSSE2] in {
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
- (CVTTPS2DQrr VR128:$src)>;
- def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
- (CVTTPS2DQrm addr:$src)>;
-}
-
-let Predicates = [HasAVX, NoVLX] in
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+let Predicates = [HasAVX, NoVLX] in {
+// XMM only
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
-
-// The assembler can recognize rr 256-bit instructions by seeing a ymm
-// register, but the same isn't true when using memory operands instead.
-// Provide other assembly rr and rm forms to address this explicitly.
-
-// XMM only
-def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
-
-let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
-def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
// YMM only
-let Predicates = [HasAVX, NoVLX] in {
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1605,11 +1542,12 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
[(set VR128:$dst,
(v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
-}
-def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+} // Predicates = [HasAVX, NoVLX]
+
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
+ (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
@@ -1618,21 +1556,6 @@ let Predicates = [HasAVX, NoVLX] in {
(VCVTTPD2DQYrm addr:$src)>;
}
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
- (VCVTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
- (VCVTPD2DQrm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
- (VCVTTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
- (VCVTTPD2DQrm addr:$src)>;
-} // Predicates = [HasAVX, NoVLX]
-
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1644,21 +1567,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
(v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
Sched<[WriteCvtPD2ILd]>;
-let Predicates = [UseSSE2] in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
- (CVTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
- (CVTPD2DQrm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
- (CVTTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
- (CVTTPD2DQrm addr:$src)>;
-} // Predicates = [UseSSE2]
-
// Convert packed single to packed double
let Predicates = [HasAVX, NoVLX] in {
// SSE2 instructions without OpSize prefix
@@ -1697,7 +1605,10 @@ let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
+ (v2f64 (X86VSintToFP
+ (bc_v4i32
+ (v2i64 (scalar_to_vector
+ (loadi64 addr:$src)))))))]>,
VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -1721,7 +1632,10 @@ let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
+ (v2f64 (X86VSintToFP
+ (bc_v4i32
+ (v2i64 (scalar_to_vector
+ (loadi64 addr:$src)))))))]>,
Sched<[WriteCvtI2PDLd]>;
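// The rm pattern above now models the load as a 64-bit scalar_to_vector
// (two i32 elements) rather than a full 128-bit loadv4i32, matching the fact
// that cvtdq2pd only reads the low 64 bits of its memory operand.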
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -1731,17 +1645,13 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
- (VCVTDQ2PDrm addr:$src)>;
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]
// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
- (CVTDQ2PDrm addr:$src)>;
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]
@@ -1749,38 +1659,31 @@ let Predicates = [UseSSE2] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
-let Predicates = [HasAVX, NoVLX] in
+let Predicates = [HasAVX, NoVLX] in {
+// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
-
-// XMM only
-def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
-let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
-def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">;
-// YMM only
-let Predicates = [HasAVX, NoVLX] in {
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (fpround VR256:$src))]>,
+ [(set VR128:$dst, (X86vfpround VR256:$src))]>,
VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>,
+ [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>,
VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
-}
-def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
+} // Predicates = [HasAVX, NoVLX]
+
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">;
+ (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
@@ -1791,28 +1694,11 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
[(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
Sched<[WriteCvtPD2PS.Folded]>;
-// AVX 256-bit register conversion intrinsics
-// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
-// whenever possible to avoid declaring two versions of each one.
-
let Predicates = [HasAVX, NoVLX] in {
- // Match fpround and fpextend for 128/256-bit conversions
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
- (VCVTPD2PSrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
- (VCVTPD2PSrm addr:$src)>;
-}
-
-let Predicates = [UseSSE2] in {
- // Match fpround and fpextend for 128 conversions
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
- (CVTPD2PSrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
- (CVTPD2PSrm addr:$src)>;
+ def : Pat<(v4f32 (fpround (v4f64 VR256:$src))),
+ (VCVTPD2PSYrr VR256:$src)>;
+ def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
+ (VCVTPD2PSYrm addr:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -1821,94 +1707,80 @@ let Predicates = [UseSSE2] in {
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
- Operand CC, SDNode OpNode, ValueType VT,
- PatFrag ld_frag, string asm, string asm_alt,
+ SDNode OpNode, ValueType VT,
+ PatFrag ld_frag, string asm,
X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rr : SIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
Sched<[sched]>;
def rm : SIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1),
(ld_frag addr:$src2), imm:$cc))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
-
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>,
- Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>,
- Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
- }
}
-let ExeDomain = SSEPackedSingle in
-defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
- "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
-let ExeDomain = SSEPackedDouble in
-defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
- "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PD.Scl>,
- XD, VEX_4V, VEX_LIG, VEX_WIG;
-
-let Constraints = "$src1 = $dst" in {
+let isCodeGenOnly = 1 in {
let ExeDomain = SSEPackedSingle in
- defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
- "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
- "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PS.Scl>, XS;
+ defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
+ "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
- defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
- "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
- "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PD.Scl>, XD;
+ defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
+ "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.Scl>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
+
+ let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in
+ defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl>, XS;
+ let ExeDomain = SSEPackedDouble in
+ defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
+ "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.Scl>, XD;
+ }
}
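// Note: with the AVXCC/SSECC operand and the "cmp${cc}ss" alternate mnemonics
// removed, these definitions take an explicit u8imm:$cc and are marked
// isCodeGenOnly, i.e. they exist only for instruction selection. The
// pseudo-op spellings (cmpeqss, cmpltsd, ...) are presumably provided by
// assembler-only definitions or aliases outside this hunk.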
-multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
+multiclass sse12_cmp_scalar_int<Operand memop,
Intrinsic Int, string asm, X86FoldableSchedWrite sched,
ComplexPattern mem_cpat> {
def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src, CC:$cc), asm,
+ (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
VR128:$src, imm:$cc))]>,
Sched<[sched]>;
let mayLoad = 1 in
def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, memop:$src, CC:$cc), asm,
+ (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
mem_cpat:$src, imm:$cc))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-let isCodeGenOnly = 1 in {
- // Aliases to match intrinsics which expect XMM operand(s).
+// Aliases to match intrinsics which expect XMM operand(s).
+let ExeDomain = SSEPackedSingle in
+defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
+ "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
+let ExeDomain = SSEPackedDouble in
+defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
+ "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
+let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
- defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
- "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V;
+ defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
+ "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}",
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
let ExeDomain = SSEPackedDouble in
- defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
- "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
- XD, VEX_4V;
- let Constraints = "$src1 = $dst" in {
- let ExeDomain = SSEPackedSingle in
- defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
- "cmp${cc}ss\t{$src, $dst|$dst, $src}",
- SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
- let ExeDomain = SSEPackedDouble in
- defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
- "cmp${cc}sd\t{$src, $dst|$dst, $src}",
- SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
-}
+ defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
+ "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}",
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}
@@ -1962,14 +1834,14 @@ let Defs = [EFLAGS] in {
let isCodeGenOnly = 1 in {
defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG;
+ sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG;
+ sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG;
+ sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG;
+ sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
}
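// Note: VEX_LIG marks the scalar (v)ucomis*/(v)comis* forms as ignoring the
// VEX.L bit (analogous to VEX_WIG for VEX.W); this mainly affects encoding
// and disassembler strictness rather than selection.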
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss", WriteFCom>, PS;
@@ -1998,56 +1870,38 @@ let Defs = [EFLAGS] in {
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
- Operand CC, ValueType VT, string asm,
- string asm_alt, X86FoldableSchedWrite sched,
+ ValueType VT, string asm,
+ X86FoldableSchedWrite sched,
Domain d, PatFrag ld_frag> {
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
[(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
Sched<[sched]>;
def rmi : PIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst,
(VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
-
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def rri_alt : PIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
- asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- def rmi_alt : PIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
- asm_alt, [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- }
}
-defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
- "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
-defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
- "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
-defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
- "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
- "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
- defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
- "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
+ defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
- defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
- "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
+ defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
"cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}
@@ -2111,12 +1965,14 @@ let Predicates = [UseSSE1] in {
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string asm, PatFrag mem_frag,
- X86FoldableSchedWrite sched, Domain d> {
+ X86FoldableSchedWrite sched, Domain d,
+ bit IsCommutable = 0> {
def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
(i8 imm:$src3))))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let isCommutable = IsCommutable in
def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
@@ -2148,7 +2004,7 @@ let Constraints = "$src1 = $dst" in {
memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
+ memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
}
//===----------------------------------------------------------------------===//
@@ -2238,6 +2094,13 @@ let Predicates = [HasAVX1Only] in {
(VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}
+let Predicates = [UseSSE2] in {
+ // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (v2f64 (nonvolatile_load addr:$src2)))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+}
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign mask
//===----------------------------------------------------------------------===//
@@ -2523,99 +2386,6 @@ let Predicates = [HasAVX1Only] in {
(VANDNPSYrm VR256:$src1, addr:$src2)>;
}
-let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
- // Use packed logical operations for scalar ops.
- def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
-
- def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
-}
-
-let Predicates = [UseSSE1] in {
- // Use packed logical operations for scalar ops.
- def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
-}
-
-let Predicates = [UseSSE2] in {
- // Use packed logical operations for scalar ops.
- def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
-}
-
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
(VPANDrr VR128:$src1, VR128:$src2)>;
@@ -2908,7 +2678,8 @@ let isCodeGenOnly = 1 in {
// patterns we have to try to match.
multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
ValueType VT, ValueType EltTy,
- RegisterClass RC, Predicate BasePredicate> {
+ RegisterClass RC, PatFrag ld_frag,
+ Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
// extracted scalar math op with insert via movss/movsd
def : Pat<(VT (Move (VT VR128:$dst),
@@ -2917,6 +2688,11 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
RC:$src))))),
(!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
(VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ (ld_frag addr:$src)))))),
+ (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
}
// Repeat for AVX versions of the instructions.
@@ -2928,18 +2704,23 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
RC:$src))))),
(!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
(VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ (ld_frag addr:$src)))))),
+ (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
}
}
-defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
-defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
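// Note: the new ld_frag parameter lets scalar_math_patterns also match the
// folded-load form, e.g. (X86Movss $dst, (scalar_to_vector
// (fadd (extractelt $dst, 0), (loadf32 addr)))) now selects ADDSSrm_Int /
// VADDSSrm_Int instead of requiring the second operand in a register first.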
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
@@ -2956,7 +2737,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType ScalarVT, X86MemOperand x86memop,
Operand intmemop, SDNode OpNode, Domain d,
X86FoldableSchedWrite sched, Predicate target> {
- let hasSideEffects = 0 in {
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
!strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
[(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
@@ -2967,8 +2748,9 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst, (OpNode (load addr:$src1)))], d>,
Sched<[sched.Folded]>,
Requires<[target, OptForSize]>;
+ }
- let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
+ let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
Sched<[sched]>;
@@ -2977,7 +2759,6 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
- }
}
@@ -3022,7 +2803,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType ScalarVT, X86MemOperand x86memop,
Operand intmemop, SDNode OpNode, Domain d,
X86FoldableSchedWrite sched, Predicate target> {
- let hasSideEffects = 0 in {
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[], d>, Sched<[sched]>;
@@ -3030,7 +2811,8 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
- let isCodeGenOnly = 1, ExeDomain = d in {
+ }
+ let hasSideEffects = 0, ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3041,7 +2823,6 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
- }
// We don't want to fold scalar loads into these instructions unless
// optimizing for size. This is because the folded instruction will have a
@@ -3197,23 +2978,6 @@ multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Mo
}
}
-multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
- ValueType VT, bits<8> ImmV,
- Predicate BasePredicate> {
- let Predicates = [BasePredicate] in {
- def : Pat<(VT (Move VT:$dst, (scalar_to_vector
- (OpNode (extractelt VT:$src, 0))))),
- (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
- }
-
- // Repeat for AVX versions of the instructions.
- let Predicates = [UseAVX] in {
- def : Pat<(VT (Move VT:$dst, (scalar_to_vector
- (OpNode (extractelt VT:$src, 0))))),
- (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
- }
-}
-
defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
@@ -3388,16 +3152,20 @@ def : Pat<(X86MFence), (MFENCE)>;
// SSE 1 & 2 - Load/Store XCSR register
//===----------------------------------------------------------------------===//
+let mayLoad=1, hasSideEffects=1 in
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
+let mayStore=1, hasSideEffects=1 in
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
+let mayLoad=1, hasSideEffects=1 in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
TB, Sched<[WriteLDMXCSR]>;
+let mayStore=1, hasSideEffects=1 in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
TB, Sched<[WriteSTMXCSR]>;
@@ -3529,17 +3297,6 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
} // ExeDomain = SSEPackedInt
-// Aliases to help the assembler pick two byte VEX encodings by swapping the
-// operands relative to the normal instructions to use VEX.R instead of VEX.B.
-def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
- (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
- (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
-def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
- (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
- (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
-
// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
(VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
@@ -4118,7 +3875,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : Ii8<0xC4, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1,
i16mem:$src2, u8imm:$src3),
@@ -4138,7 +3895,7 @@ def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
imm:$src2))]>,
- PD, VEX, Sched<[WriteVecExtract]>;
+ PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -4148,7 +3905,7 @@ def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
// Insert
let Predicates = [HasAVX, NoBWI] in
-defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
+defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;
@@ -4279,19 +4036,11 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
[(set FR32:$dst, (bitconvert GR32:$src))]>,
VEX, Sched<[WriteVecMoveFromGpr]>;
- def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
- VEX, Sched<[WriteVecLoad]>;
def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert GR32:$src))]>,
Sched<[WriteVecMoveFromGpr]>;
- def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
- Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
@@ -4353,32 +4102,15 @@ def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
// Bitcast FR64 <-> GR64
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
- let Predicates = [UseAVX] in
- def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
- VEX, Sched<[WriteVecLoad]>;
def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))]>,
VEX, Sched<[WriteVecMoveToGpr]>;
- def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
- VEX, Sched<[WriteVecStore]>;
- def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
- Sched<[WriteVecLoad]>;
def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))]>,
Sched<[WriteVecMoveToGpr]>;
- def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
- Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
@@ -4389,18 +4121,10 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))]>,
VEX, Sched<[WriteVecMoveToGpr]>;
- def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
- VEX, Sched<[WriteVecStore]>;
def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))]>,
Sched<[WriteVecMoveToGpr]>;
- def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
- Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
let Predicates = [UseAVX] in {
@@ -4410,28 +4134,14 @@ let Predicates = [UseAVX] in {
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
(VMOV64toPQIrr GR64:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
// These instructions also write zeros in the high part of a 256-bit register.
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
- def : Pat<(v8i32 (X86vzload addr:$src)),
+ def : Pat<(v8i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
- // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
}
let Predicates = [UseSSE2] in {
@@ -4442,11 +4152,7 @@ let Predicates = [UseSSE2] in {
(MOV64toPQIrr GR64:$src)>;
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
(MOVDI2PDIrm addr:$src)>;
}
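// Note: X86vzload32/X86vzload64 encode the load width in the fragment itself,
// so a single pattern replaces the previous scalar_to_vector+X86vzmovl,
// X86vzmovl(load) and width-ambiguous X86vzload forms removed above.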
@@ -4508,32 +4214,26 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", []>;
}
-// Aliases to help the assembler pick two byte VEX encodings by swapping the
-// operands relative to the normal instructions to use VEX.R instead of VEX.B.
-def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
- (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
-
def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
(VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
(MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
let Predicates = [UseAVX] in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ def : Pat<(v2i64 (X86vzload64 addr:$src)),
(VMOVQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)),
- (VMOVQI2PQIrm addr:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
- def : Pat<(v4i64 (X86vzload addr:$src)),
+ def : Pat<(v4i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
+
+ def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
+ (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE2] in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (MOVQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
+
+ def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
+ (MOVPQI2QImr addr:$dst, VR128:$src)>;
}
//===---------------------------------------------------------------------===//
@@ -4560,6 +4260,19 @@ let Predicates = [UseSSE2] in {
(MOVZPQILo2PQIrr VR128:$src)>;
}
+let Predicates = [UseAVX] in {
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIrr
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIrr
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
+}
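// Note: a 256-bit vzmovl is handled by running VMOVZPQILo2PQIrr on the low
// 128-bit subregister; as a VEX-encoded 128-bit instruction it also clears
// bits 255:128 of the YMM register, which SUBREG_TO_REG then models.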
+
//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
@@ -4667,17 +4380,17 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+ def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
+ def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}
let Predicates = [UseSSE3] in {
// No need for aligned memory as this only loads 64-bits.
- def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+ def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
(MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
+ def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(MOVDDUPrm addr:$src)>;
}
@@ -5130,15 +4843,12 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
//===---------------------------------------------------------------------===//
let SchedRW = [WriteSystem] in {
-let usesCustomInserter = 1 in {
-def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
- [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
- Requires<[HasSSE3]>;
-}
-
let Uses = [EAX, ECX, EDX] in
-def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
- TB, Requires<[HasSSE3]>;
+def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
+ TB, Requires<[HasSSE3, Not64BitMode]>;
+let Uses = [RAX, ECX, EDX] in
+def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
+ TB, Requires<[HasSSE3, In64BitMode]>;
let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
@@ -5148,13 +4858,14 @@ def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
-def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
Requires<[Not64BitMode]>;
-def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
Requires<[In64BitMode]>;
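// Note: monitor takes its address implicitly from EAX in 32-bit mode and RAX
// in 64-bit mode (ECX/EDX hold the extension and hint arguments), hence the
// split into MONITOR32rrr/MONITOR64rrr with distinct Uses lists. Dropping the
// custom-inserter pseudo suggests the int_x86_sse3_monitor intrinsic is now
// selected elsewhere (presumably during ISel lowering).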
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
+// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
//===----------------------------------------------------------------------===//
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
@@ -5202,71 +4913,38 @@ defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
-// Patterns that we also need for any_extend.
-// Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg.
-multiclass SS41I_pmovx_avx2_patterns_base<string OpcPrefix, SDNode ExtOp> {
- // Register-Register patterns
- let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
- (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
- }
-
- let Predicates = [HasAVX2, NoVLX] in {
- def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
- (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
-
- def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
- (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
- }
-
- // AVX2 Register-Memory patterns
- let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
- def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
- def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
- }
-
- let Predicates = [HasAVX2, NoVLX] in {
- def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
- (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
- def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
- def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
-
- def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
- (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
- def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
- def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
- }
-}
-
// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
- SDNode ExtOp, SDNode InVecOp> :
- SS41I_pmovx_avx2_patterns_base<OpcPrefix, ExtOp> {
-
+ SDNode ExtOp, SDNode InVecOp> {
// Register-Register patterns
+ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
+ }
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
(!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
(!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
+ def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
(!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
+ (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
}
// Simple Register-Memory patterns
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
(!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+
+ def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
}
+
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
@@ -5284,38 +4962,31 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
// AVX2 Register-Memory patterns
let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
+ def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
}
}
defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
-defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>;
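// Note: the separate anyext pattern multiclass is no longer needed; per the
// comment added above, any_extend is promoted to zero_extend in
// X86ISelDAGToDAG.cpp, so the zext patterns cover it.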
// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
@@ -5361,9 +5032,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
@@ -5371,19 +5040,13 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
@@ -5391,18 +5054,14 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
@@ -5411,9 +5070,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
@@ -5451,7 +5108,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
}
let Predicates = [HasAVX, NoBWI] in
- defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
+ defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
@@ -5475,7 +5132,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
}
let Predicates = [HasAVX, NoBWI] in
- defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
+ defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
@@ -5548,18 +5205,6 @@ let ExeDomain = SSEPackedSingle in {
defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}
-// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
-def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
- imm:$src2))),
- addr:$dst),
- (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
- Requires<[HasAVX]>;
-def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
- imm:$src2))),
- addr:$dst),
- (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
- Requires<[UseSSE41]>;
-
//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//
@@ -5573,7 +5218,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5586,7 +5231,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX, NoBWI] in
- defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
+ defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
@@ -5599,7 +5244,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i32mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5625,7 +5270,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i64mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5647,6 +5292,7 @@ let Constraints = "$src1 = $dst" in
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
+ let isCommutable = 1 in
def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5853,7 +5499,7 @@ let Predicates = [HasAVX, NoVLX] in {
VEX, VEX_L, VEX_WIG;
}
}
-let Predicates = [HasAVX, NoAVX512] in {
+let Predicates = [UseAVX] in {
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales, 0>,
VEX_4V, VEX_LIG, VEX_WIG;
@@ -5862,141 +5508,17 @@ let Predicates = [HasAVX, NoAVX512] in {
}
let Predicates = [UseAVX] in {
- def : Pat<(ffloor FR32:$src),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
- def : Pat<(f32 (fnearbyint FR32:$src)),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
- def : Pat<(f32 (fceil FR32:$src)),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
- def : Pat<(f32 (frint FR32:$src)),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
- def : Pat<(f32 (ftrunc FR32:$src)),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
-
- def : Pat<(f64 (ffloor FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
- def : Pat<(f64 (fnearbyint FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
- def : Pat<(f64 (fceil FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
- def : Pat<(f64 (frint FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
- def : Pat<(f64 (ftrunc FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
+ def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>;
}
let Predicates = [UseAVX, OptForSize] in {
- def : Pat<(ffloor (loadf32 addr:$src)),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
- def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
- def : Pat<(f32 (fceil (loadf32 addr:$src))),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
- def : Pat<(f32 (frint (loadf32 addr:$src))),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
- def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
-
- def : Pat<(f64 (ffloor (loadf64 addr:$src))),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
- def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
- def : Pat<(f64 (fceil (loadf64 addr:$src))),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
- def : Pat<(f64 (frint (loadf64 addr:$src))),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
- def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4f32 (ffloor VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0x9))>;
- def : Pat<(v4f32 (fnearbyint VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0xC))>;
- def : Pat<(v4f32 (fceil VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0xA))>;
- def : Pat<(v4f32 (frint VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0x4))>;
- def : Pat<(v4f32 (ftrunc VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0xB))>;
-
- def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))),
- (VROUNDPSm addr:$src, (i32 0x9))>;
- def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))),
- (VROUNDPSm addr:$src, (i32 0xC))>;
- def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))),
- (VROUNDPSm addr:$src, (i32 0xA))>;
- def : Pat<(v4f32 (frint (loadv4f32 addr:$src))),
- (VROUNDPSm addr:$src, (i32 0x4))>;
- def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))),
- (VROUNDPSm addr:$src, (i32 0xB))>;
-
- def : Pat<(v2f64 (ffloor VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0x9))>;
- def : Pat<(v2f64 (fnearbyint VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0xC))>;
- def : Pat<(v2f64 (fceil VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0xA))>;
- def : Pat<(v2f64 (frint VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0x4))>;
- def : Pat<(v2f64 (ftrunc VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0xB))>;
-
- def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))),
- (VROUNDPDm addr:$src, (i32 0x9))>;
- def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))),
- (VROUNDPDm addr:$src, (i32 0xC))>;
- def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))),
- (VROUNDPDm addr:$src, (i32 0xA))>;
- def : Pat<(v2f64 (frint (loadv2f64 addr:$src))),
- (VROUNDPDm addr:$src, (i32 0x4))>;
- def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))),
- (VROUNDPDm addr:$src, (i32 0xB))>;
-
- def : Pat<(v8f32 (ffloor VR256:$src)),
- (VROUNDPSYr VR256:$src, (i32 0x9))>;
- def : Pat<(v8f32 (fnearbyint VR256:$src)),
- (VROUNDPSYr VR256:$src, (i32 0xC))>;
- def : Pat<(v8f32 (fceil VR256:$src)),
- (VROUNDPSYr VR256:$src, (i32 0xA))>;
- def : Pat<(v8f32 (frint VR256:$src)),
- (VROUNDPSYr VR256:$src, (i32 0x4))>;
- def : Pat<(v8f32 (ftrunc VR256:$src)),
- (VROUNDPSYr VR256:$src, (i32 0xB))>;
-
- def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))),
- (VROUNDPSYm addr:$src, (i32 0x9))>;
- def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))),
- (VROUNDPSYm addr:$src, (i32 0xC))>;
- def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))),
- (VROUNDPSYm addr:$src, (i32 0xA))>;
- def : Pat<(v8f32 (frint (loadv8f32 addr:$src))),
- (VROUNDPSYm addr:$src, (i32 0x4))>;
- def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))),
- (VROUNDPSYm addr:$src, (i32 0xB))>;
-
- def : Pat<(v4f64 (ffloor VR256:$src)),
- (VROUNDPDYr VR256:$src, (i32 0x9))>;
- def : Pat<(v4f64 (fnearbyint VR256:$src)),
- (VROUNDPDYr VR256:$src, (i32 0xC))>;
- def : Pat<(v4f64 (fceil VR256:$src)),
- (VROUNDPDYr VR256:$src, (i32 0xA))>;
- def : Pat<(v4f64 (frint VR256:$src)),
- (VROUNDPDYr VR256:$src, (i32 0x4))>;
- def : Pat<(v4f64 (ftrunc VR256:$src)),
- (VROUNDPDYr VR256:$src, (i32 0xB))>;
-
- def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))),
- (VROUNDPDYm addr:$src, (i32 0x9))>;
- def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))),
- (VROUNDPDYm addr:$src, (i32 0xC))>;
- def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))),
- (VROUNDPDYm addr:$src, (i32 0xA))>;
- def : Pat<(v4f64 (frint (loadv4f64 addr:$src))),
- (VROUNDPDYm addr:$src, (i32 0x4))>;
- def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))),
- (VROUNDPDYm addr:$src, (i32 0xB))>;
+ def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
}
let ExeDomain = SSEPackedSingle in
@@ -6013,108 +5535,19 @@ defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
- def : Pat<(ffloor FR32:$src),
- (ROUNDSSr FR32:$src, (i32 0x9))>;
- def : Pat<(f32 (fnearbyint FR32:$src)),
- (ROUNDSSr FR32:$src, (i32 0xC))>;
- def : Pat<(f32 (fceil FR32:$src)),
- (ROUNDSSr FR32:$src, (i32 0xA))>;
- def : Pat<(f32 (frint FR32:$src)),
- (ROUNDSSr FR32:$src, (i32 0x4))>;
- def : Pat<(f32 (ftrunc FR32:$src)),
- (ROUNDSSr FR32:$src, (i32 0xB))>;
-
- def : Pat<(f64 (ffloor FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0x9))>;
- def : Pat<(f64 (fnearbyint FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0xC))>;
- def : Pat<(f64 (fceil FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0xA))>;
- def : Pat<(f64 (frint FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0x4))>;
- def : Pat<(f64 (ftrunc FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0xB))>;
+ def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
+ (ROUNDSSr FR32:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
+ (ROUNDSDr FR64:$src1, imm:$src2)>;
}
let Predicates = [UseSSE41, OptForSize] in {
- def : Pat<(ffloor (loadf32 addr:$src)),
- (ROUNDSSm addr:$src, (i32 0x9))>;
- def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
- (ROUNDSSm addr:$src, (i32 0xC))>;
- def : Pat<(f32 (fceil (loadf32 addr:$src))),
- (ROUNDSSm addr:$src, (i32 0xA))>;
- def : Pat<(f32 (frint (loadf32 addr:$src))),
- (ROUNDSSm addr:$src, (i32 0x4))>;
- def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
- (ROUNDSSm addr:$src, (i32 0xB))>;
-
- def : Pat<(f64 (ffloor (loadf64 addr:$src))),
- (ROUNDSDm addr:$src, (i32 0x9))>;
- def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
- (ROUNDSDm addr:$src, (i32 0xC))>;
- def : Pat<(f64 (fceil (loadf64 addr:$src))),
- (ROUNDSDm addr:$src, (i32 0xA))>;
- def : Pat<(f64 (frint (loadf64 addr:$src))),
- (ROUNDSDm addr:$src, (i32 0x4))>;
- def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
- (ROUNDSDm addr:$src, (i32 0xB))>;
+ def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
+ (ROUNDSSm addr:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
+ (ROUNDSDm addr:$src1, imm:$src2)>;
}
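// Note: the per-operation ffloor/fceil/ftrunc/frint/fnearbyint patterns are
// folded into a single X86VRndScale node that carries the rounding immediate
// directly (presumably attached during ISel lowering). A minimal C++ sketch
// (not part of this patch) of the immediates the removed patterns used, per
// the SSE4.1 ROUND* encoding: bit 3 suppresses precision exceptions, bit 2
// selects the dynamic (MXCSR) rounding mode, bits 1:0 pick the static mode.
enum RoundImm : unsigned {
  kRint      = 0x4, // dynamic mode, precision exceptions reported
  kFloor     = 0x9, // round down,        exceptions suppressed
  kCeil      = 0xA, // round up,          exceptions suppressed
  kTrunc     = 0xB, // round toward zero, exceptions suppressed
  kNearbyInt = 0xC  // dynamic mode,      exceptions suppressed
};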
-let Predicates = [UseSSE41] in {
- def : Pat<(v4f32 (ffloor VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0x9))>;
- def : Pat<(v4f32 (fnearbyint VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0xC))>;
- def : Pat<(v4f32 (fceil VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0xA))>;
- def : Pat<(v4f32 (frint VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0x4))>;
- def : Pat<(v4f32 (ftrunc VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0xB))>;
-
- def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))),
- (ROUNDPSm addr:$src, (i32 0x9))>;
- def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))),
- (ROUNDPSm addr:$src, (i32 0xC))>;
- def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))),
- (ROUNDPSm addr:$src, (i32 0xA))>;
- def : Pat<(v4f32 (frint (memopv4f32 addr:$src))),
- (ROUNDPSm addr:$src, (i32 0x4))>;
- def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))),
- (ROUNDPSm addr:$src, (i32 0xB))>;
-
- def : Pat<(v2f64 (ffloor VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0x9))>;
- def : Pat<(v2f64 (fnearbyint VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0xC))>;
- def : Pat<(v2f64 (fceil VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0xA))>;
- def : Pat<(v2f64 (frint VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0x4))>;
- def : Pat<(v2f64 (ftrunc VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0xB))>;
-
- def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))),
- (ROUNDPDm addr:$src, (i32 0x9))>;
- def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))),
- (ROUNDPDm addr:$src, (i32 0xC))>;
- def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))),
- (ROUNDPDm addr:$src, (i32 0xA))>;
- def : Pat<(v2f64 (frint (memopv2f64 addr:$src))),
- (ROUNDPDm addr:$src, (i32 0x4))>;
- def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))),
- (ROUNDPDm addr:$src, (i32 0xB))>;
-}
-
-defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss,
- v4f32, 0x01, UseSSE41>;
-defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss,
- v4f32, 0x02, UseSSE41>;
-defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd,
- v2f64, 0x01, UseSSE41>;
-defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd,
- v2f64, 0x02, UseSSE41>;
-
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//
@@ -6449,6 +5882,72 @@ def BlendCommuteImm8 : SDNodeXForm<imm, [{
return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;
+// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
+def BlendScaleImm4 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
+def BlendScaleImm2 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0xf << (i * 4);
+ }
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
+def BlendScaleImm2to4 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
+def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm ^ 0xff, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
+def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0xf << (i * 4);
+ }
+ return getI8Imm(NewImm ^ 0xff, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
+def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm ^ 0xf, SDLoc(N));
+}]>;
+
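The immediate rewriting these SDNodeXForms perform is easy to check in isolation. A minimal standalone C++ sketch of the same scaling logic (scaleBlendImm is a hypothetical helper, not part of this patch; the asserts mirror BlendScaleImm4, BlendScaleImm2, BlendScaleImm2to4 and BlendScaleCommuteImm4 above):

#include <cassert>
#include <cstdint>

// Expand each selector bit of a narrow blend immediate into 'Scale'
// consecutive bits of the wider immediate, e.g. a v4i32 blend mask 0b0101
// becomes 0b00110011 when reused for pblendw's 16-bit lanes.
static uint8_t scaleBlendImm(uint8_t Imm, unsigned SrcBits, unsigned Scale) {
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != SrcBits; ++i)
    if (Imm & (1u << i))
      NewImm |= ((1u << Scale) - 1) << (i * Scale);
  return NewImm;
}

int main() {
  assert(scaleBlendImm(0x5, 4, 2) == 0x33);          // BlendScaleImm4
  assert(scaleBlendImm(0x1, 2, 4) == 0x0f);          // BlendScaleImm2
  assert(scaleBlendImm(0x2, 2, 2) == 0x0c);          // BlendScaleImm2to4
  assert((scaleBlendImm(0x5, 4, 2) ^ 0xff) == 0xcc); // BlendScaleCommuteImm4
  return 0;
}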
let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
@@ -6559,6 +6058,42 @@ let Predicates = [HasAVX2] in {
VEX_4V, VEX_L, VEX_WIG;
}
+// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
+// ExecutionDomainFixPass will clean up domains later on.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
+ (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
+ (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
+ (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movsd via commuting under optsize.
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
+
+def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
+ (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
+ (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
+ (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
+
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movss via commuting under optsize.
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+}
+
defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
VR128, memop, f128mem, 1, SSEPackedSingle,
SchedWriteFBlend.XMM, BlendCommuteImm4>;
@@ -6569,6 +6104,24 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
VR128, memop, i128mem, 1, SSEPackedInt,
SchedWriteBlend.XMM, BlendCommuteImm8>;
+let Predicates = [UseSSE41] in {
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movss via commuting under optsize.
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
+ (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
+def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
+
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
+ (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+}
+
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
@@ -6580,18 +6133,25 @@ def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
+
+def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
+ (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
+def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
-/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
-multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
- RegisterClass RC, X86MemOperand x86memop,
- PatFrag mem_frag, Intrinsic IntId,
- X86FoldableSchedWrite sched> {
+/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operators
+multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ PatFrag mem_frag, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+ [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
SSEPackedInt>, TAPD, VEX_4V,
Sched<[sched]>;
@@ -6600,8 +6160,8 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
- (IntId RC:$src1, (mem_frag addr:$src2),
- RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
+ (OpNode RC:$src3, (mem_frag addr:$src2),
+ RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
Sched<[sched.Folded, sched.ReadAfterFold,
// x86memop:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -6612,68 +6172,47 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
-defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
- load, int_x86_sse41_blendvpd,
- SchedWriteFVarBlend.XMM>;
-defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
- loadv4f64, int_x86_avx_blendv_pd_256,
- SchedWriteFVarBlend.YMM>, VEX_L;
+defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
+ v2f64, loadv2f64, X86Blendv,
+ SchedWriteFVarBlend.XMM>;
+defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
+ v4f64, loadv4f64, X86Blendv,
+ SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
-defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
- load, int_x86_sse41_blendvps,
- SchedWriteFVarBlend.XMM>;
-defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
- loadv8f32, int_x86_avx_blendv_ps_256,
- SchedWriteFVarBlend.YMM>, VEX_L;
+defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
+ v4f32, loadv4f32, X86Blendv,
+ SchedWriteFVarBlend.XMM>;
+defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
+ v8f32, loadv8f32, X86Blendv,
+ SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
-defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
- load, int_x86_sse41_pblendvb,
- SchedWriteVarBlend.XMM>;
+defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
+ v16i8, loadv16i8, X86Blendv,
+ SchedWriteVarBlend.XMM>;
}
let Predicates = [HasAVX2] in {
-defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
- load, int_x86_avx2_pblendvb,
- SchedWriteVarBlend.YMM>, VEX_L;
+defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
+ v32i8, loadv32i8, X86Blendv,
+ SchedWriteVarBlend.YMM>, VEX_L;
}
let Predicates = [HasAVX] in {
- def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
- (v16i8 VR128:$src2))),
- (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
- (v4i32 VR128:$src2))),
+ def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
(VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
- (v4f32 VR128:$src2))),
- (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
- (v2i64 VR128:$src2))),
- (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
- (v2f64 VR128:$src2))),
+ def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
(VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
- (v8i32 VR256:$src2))),
+ def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+ (v8i32 VR256:$src2))),
(VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
- def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
- (v8f32 VR256:$src2))),
- (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
- def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
- (v4i64 VR256:$src2))),
- (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
- def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
- (v4f64 VR256:$src2))),
+ def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+ (v4i64 VR256:$src2))),
(VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
-let Predicates = [HasAVX2] in {
- def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
- (v32i8 VR256:$src2))),
- (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-}
-
// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
@@ -6708,17 +6247,6 @@ let Predicates = [HasAVX, OptForSpeed] in {
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
(i8 3))), sub_xmm)>;
-
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
- (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
- (i8 1))), sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
- (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
- (i8 0xf))), sub_xmm)>;
}
// Prefer a movss or movsd over a blendps when optimizing for size. These were
@@ -6747,16 +6275,17 @@ let Predicates = [UseSSE41, OptForSpeed] in {
}
-/// SS41I_ternary_int - SSE 4.1 ternary operator
+/// SS41I_ternary - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
- multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
- X86MemOperand x86memop, Intrinsic IntId,
- X86FoldableSchedWrite sched> {
+ multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
+ PatFrag mem_frag, X86MemOperand x86memop,
+ SDNode OpNode, X86FoldableSchedWrite sched> {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
"\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
- [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+ [(set VR128:$dst,
+ (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
Sched<[sched]>;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
@@ -6764,20 +6293,19 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
!strconcat(OpcodeStr,
"\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
[(set VR128:$dst,
- (IntId VR128:$src1,
- (mem_frag addr:$src2), XMM0))]>,
+ (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem,
- int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
+defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
+ X86Blendv, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem,
- int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem,
- int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
+defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
+ X86Blendv, SchedWriteFVarBlend.XMM>;
+defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
+ X86Blendv, SchedWriteVarBlend.XMM>;
// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
@@ -6794,20 +6322,11 @@ def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
(PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
let Predicates = [UseSSE41] in {
- def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
- (v16i8 VR128:$src2))),
- (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
- (v4i32 VR128:$src2))),
+ def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
(BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
- (v4f32 VR128:$src2))),
- (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
- (v2i64 VR128:$src2))),
- (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
- (v2f64 VR128:$src2))),
+ def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
(BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}
@@ -7451,17 +6970,6 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
"vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
-let Predicates = [HasAVX2, NoVLX] in {
-def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
- (VBROADCASTI128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
- (VBROADCASTI128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
- (VBROADCASTI128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
- (VBROADCASTI128 addr:$src)>;
-}
-
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF128 addr:$src)>;
@@ -7469,7 +6977,9 @@ def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
(VBROADCASTF128 addr:$src)>;
}
-let Predicates = [HasAVX1Only] in {
+// NOTE: We're using FP instructions here, but execution domain fixing can
+// convert to integer when profitable.
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
@@ -7765,12 +7275,10 @@ let Predicates = [HasF16C, NoVLX] in {
WriteCvtPS2PHYSt>, VEX_L;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
- (VCVTPH2PSrm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
+ (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTPH2PSrm addr:$src)>;
def : Pat<(store (f64 (extractelt
@@ -7835,6 +7343,7 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
(commuteXForm imm:$src3))>;
}
+let Predicates = [HasAVX2] in {
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
SchedWriteBlend.XMM, VR128, i128mem,
BlendCommuteImm4>;
@@ -7842,28 +7351,26 @@ defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
SchedWriteBlend.YMM, VR256, i256mem,
BlendCommuteImm8>, VEX_L;
-// For insertion into the zero index (low half) of a 256-bit vector, it is
-// more efficient to generate a blend with immediate instead of an insert*128.
-let Predicates = [HasAVX2] in {
-def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
- (VPBLENDDYrri VR256:$src1,
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src2, sub_xmm), 0xf)>;
-def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
- (VPBLENDDYrri VR256:$src1,
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src2, sub_xmm), 0xf)>;
-def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
- (VPBLENDDYrri VR256:$src1,
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src2, sub_xmm), 0xf)>;
-def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
- (VPBLENDDYrri VR256:$src1,
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
+ (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
+ (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
+ (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
+ (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
+ (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
+ (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>;
}
-let Predicates = [HasAVX1Only] in {
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with immediate instead of an insert*128.
+// NOTE: We're using FP instructions here, but execution domain fixing should
+// take care of using integer instructions when profitable.
+let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
@@ -7880,6 +7387,19 @@ def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
+
+def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
//===----------------------------------------------------------------------===//
@@ -7930,9 +7450,9 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
let Predicates = [HasAVX2, NoVLX] in {
// 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQrm addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
@@ -7952,9 +7472,15 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
(VPBROADCASTWYrm addr:$src)>;
def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWrm addr:$src)>;
def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWYrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWYrm addr:$src)>;
}
@@ -8038,7 +7564,7 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDDUPrr VR128:$src)>;
def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
(VMOVDDUPrm addr:$src)>;
- def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
+ def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPrm addr:$src)>;
}
@@ -8236,19 +7762,14 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
// masked store
- def: Pat<(X86mstore (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
+ def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
(!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
// masked load
- def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
+ def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
- def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
- (VT (bitconvert (ZeroVT immAllZerosV))))),
+ def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
+ (VT immAllZerosV))),
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
- def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
- (!cast<Instruction>(BlendStr#"rr")
- RC:$src0,
- (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)),
- RC:$mask)>;
}
let Predicates = [HasAVX] in {
defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
@@ -8275,21 +7796,6 @@ let Predicates = [HasAVX2] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
-let Predicates = [HasAVX2, NoVLX] in {
-def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
- (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v2i64 VR128:$src), 1)>;
-def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
- (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v4i32 VR128:$src), 1)>;
-def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
- (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v8i16 VR128:$src), 1)>;
-def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
- (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v16i8 VR128:$src), 1)>;
-}
-
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
(VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
@@ -8299,7 +7805,9 @@ def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
(v4f32 VR128:$src), 1)>;
}
-let Predicates = [HasAVX1Only] in {
+// NOTE: We're using FP instructions here, but execution domain fixing can
+// convert to integer when profitable.
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
(VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
(v2i64 VR128:$src), 1)>;
@@ -8350,20 +7858,11 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
let Predicates = [HasAVX2, NoVLX] in {
- defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
- defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
- defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
- defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
- defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
-
- def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
- (VPSRAVDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86vsrav VR128:$src1, (load addr:$src2))),
- (VPSRAVDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
- (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v8i32 (X86vsrav VR256:$src1, (load addr:$src2))),
- (VPSRAVDYrm VR256:$src1, addr:$src2)>;
+ defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
+ defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
+ defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
+ defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
+ defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
//===----------------------------------------------------------------------===//
@@ -8393,7 +7892,7 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
VEX, VEX_L, Sched<[WriteLoad]>;
}
-let Predicates = [UseAVX2] in {
+let Predicates = [HasAVX2] in {
let mayLoad = 1, hasSideEffects = 0, Constraints
= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
in {
diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td
index 2dc6e8b43667..82c8e74156b2 100644
--- a/lib/Target/X86/X86InstrSVM.td
+++ b/lib/Target/X86/X86InstrSVM.td
@@ -1,9 +1,8 @@
//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 7cd63a6dd820..9d974b716dda 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -1,9 +1,8 @@
//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -31,11 +30,11 @@ def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
[(set GR64:$dst, (shl GR64:$src1, CL))]>;
} // Uses = [CL], SchedRW
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"shl{b}\t{$src2, $dst|$dst, $src2}",
[(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
-let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"shl{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>,
@@ -473,17 +472,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"rol{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 relocImm:$src2)))]>;
def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"rol{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize16;
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 relocImm:$src2)))]>,
+ OpSize16;
def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"rol{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>, OpSize32;
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 relocImm:$src2)))]>,
+ OpSize32;
def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"rol{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 relocImm:$src2)))]>;
// Rotate by 1
def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
@@ -586,16 +587,16 @@ def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
// Rotate by 1
def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t$dst",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))]>;
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t$dst",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))]>, OpSize16;
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize16;
def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t$dst",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))]>, OpSize32;
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>, OpSize32;
def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t$dst",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))]>;
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
} // Constraints = "$src = $dst", SchedRW
let Uses = [CL], SchedRW = [WriteRotateCLLd, WriteRMW] in {
@@ -634,18 +635,18 @@ def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
// Rotate by 1
def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t$dst",
- [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)]>;
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t$dst",
- [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)]>,
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
OpSize16;
def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t$dst",
- [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)]>,
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
OpSize32;
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
- [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)]>,
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
@@ -807,13 +808,54 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
} // Defs = [EFLAGS]
+// Use the opposite rotate if it allows us to use the rotate by 1 instruction.
+def : Pat<(rotl GR8:$src1, (i8 7)), (ROR8r1 GR8:$src1)>;
+def : Pat<(rotl GR16:$src1, (i8 15)), (ROR16r1 GR16:$src1)>;
+def : Pat<(rotl GR32:$src1, (i8 31)), (ROR32r1 GR32:$src1)>;
+def : Pat<(rotl GR64:$src1, (i8 63)), (ROR64r1 GR64:$src1)>;
+def : Pat<(rotr GR8:$src1, (i8 7)), (ROL8r1 GR8:$src1)>;
+def : Pat<(rotr GR16:$src1, (i8 15)), (ROL16r1 GR16:$src1)>;
+def : Pat<(rotr GR32:$src1, (i8 31)), (ROL32r1 GR32:$src1)>;
+def : Pat<(rotr GR64:$src1, (i8 63)), (ROL64r1 GR64:$src1)>;
+
+def : Pat<(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst),
+ (ROR8m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst),
+ (ROR16m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst),
+ (ROR32m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst),
+ (ROR64m1 addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(store (rotr (loadi8 addr:$dst), (i8 7)), addr:$dst),
+ (ROL8m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi16 addr:$dst), (i8 15)), addr:$dst),
+ (ROL16m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi32 addr:$dst), (i8 31)), addr:$dst),
+ (ROL32m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi64 addr:$dst), (i8 63)), addr:$dst),
+ (ROL64m1 addr:$dst)>, Requires<[In64BitMode]>;
+
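These rotate-by-1 patterns rest on the identity that rotating a w-bit value left by w-1 is the same as rotating it right by 1 (and vice versa). A small standalone C++ check of that assumption, with hypothetical helper names:

#include <cassert>
#include <cstdint>

static uint8_t rotl8(uint8_t X, unsigned N) { return (X << N) | (X >> (8 - N)); }
static uint8_t rotr8(uint8_t X, unsigned N) { return (X >> N) | (X << (8 - N)); }

int main() {
  for (unsigned V = 0; V != 256; ++V) {
    assert(rotl8(V, 7) == rotr8(V, 1)); // rotl by width-1 selects ROR*r1
    assert(rotr8(V, 7) == rotl8(V, 1)); // rotr by width-1 selects ROL*r1
  }
  return 0;
}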
// Sandy Bridge and newer Intel processors support faster rotates using
// SHLD to avoid a partial flag update on the normal rotate instructions.
-let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
- def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
- (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
- def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
- (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
+// Use a pseudo so that TwoInstructionPass and register allocation will see
+// this as a unary instruction.
+let Predicates = [HasFastSHLDRotate], AddedComplexity = 5,
+ Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteSHDrri],
+ Constraints = "$src1 = $dst" in {
+ def SHLDROT32ri : I<0, Pseudo, (outs GR32:$dst),
+ (ins GR32:$src1, u8imm:$shamt), "",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$shamt)))]>;
+ def SHLDROT64ri : I<0, Pseudo, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$shamt), "",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$shamt)))]>;
+
+ def SHRDROT32ri : I<0, Pseudo, (outs GR32:$dst),
+ (ins GR32:$src1, u8imm:$shamt), "",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$shamt)))]>;
+ def SHRDROT64ri : I<0, Pseudo, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$shamt), "",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$shamt)))]>;
}
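The SHLDROT pseudos assume that a double shift with both inputs taken from the same register behaves exactly like a rotate, which is what lets SHLD/SHRD stand in for ROL/ROR here. A brief C++ sketch of that equivalence (shld32 and rotl32 are illustrative names):

#include <cassert>
#include <cstdint>

// SHLD dst, src, n shifts dst left by n and fills the vacated low bits from
// the high bits of src; with dst == src that is exactly a left rotate.
static uint32_t shld32(uint32_t Dst, uint32_t Src, unsigned N) {
  return (Dst << N) | (Src >> ((32 - N) & 31));
}
static uint32_t rotl32(uint32_t X, unsigned N) {
  return (X << N) | (X >> ((32 - N) & 31));
}

int main() {
  const uint32_t X = 0xdeadbeefu;
  for (unsigned N = 1; N != 32; ++N)
    assert(shld32(X, X, N) == rotl32(X, N));
  return 0;
}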
def ROT32L2R_imm8 : SDNodeXForm<imm, [{
@@ -871,19 +913,29 @@ let Predicates = [HasBMI2] in {
// Prefer RORX which is non-destructive and doesn't update EFLAGS.
let AddedComplexity = 10 in {
+ def : Pat<(rotr GR32:$src, (i8 imm:$shamt)),
+ (RORX32ri GR32:$src, imm:$shamt)>;
+ def : Pat<(rotr GR64:$src, (i8 imm:$shamt)),
+ (RORX64ri GR64:$src, imm:$shamt)>;
+
def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
(RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
(RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
}
+ def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)),
+ (RORX32mi addr:$src, imm:$shamt)>;
+ def : Pat<(rotr (loadi64 addr:$src), (i8 imm:$shamt)),
+ (RORX64mi addr:$src, imm:$shamt)>;
+
def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
(RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
(RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
// Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
- // immedidate shift, i.e. the following code is considered better
+ // immediate shift, i.e. the following code is considered better
//
// mov %edi, %esi
// shl $imm, %esi
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 35ee00b9e016..7050e1917494 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -1,9 +1,8 @@
//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,10 +14,10 @@
let SchedRW = [WriteSystem] in {
let Defs = [RAX, RDX] in
- def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB;
+def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", []>, TB;
let Defs = [RAX, RCX, RDX] in
- def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
+def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
// CPU flow control instructions
@@ -411,7 +410,7 @@ let Defs = [EAX, EDX], Uses = [ECX] in
def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB;
let Defs = [RAX, RDX], Uses = [ECX] in
- def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)]>, TB;
+def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB;
def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
"smsw{w}\t$dst", []>, OpSize16, TB;
@@ -588,18 +587,13 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
//==-----------------------------------------------------------------------===//
// PKU - enable protection key
-let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- def WRPKRU : PseudoI<(outs), (ins GR32:$src),
- [(int_x86_wrpkru GR32:$src)]>;
- def RDPKRU : PseudoI<(outs GR32:$dst), (ins),
- [(set GR32:$dst, (int_x86_rdpkru))]>;
-}
-
let SchedRW = [WriteSystem] in {
let Defs = [EAX, EDX], Uses = [ECX] in
- def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
+ def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru",
+ [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB;
let Uses = [EAX, ECX, EDX] in
- def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
+ def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru",
+ [(X86wrpkru EAX, EDX, ECX)]>, TB;
} // SchedRW
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 10c6eef78639..fc0da845299f 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -1,9 +1,8 @@
//===-- X86InstrTSX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 06a438ebfcad..37bc4ce2e053 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -1,9 +1,8 @@
//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
index c417dc99b84d..e98843bd3ae3 100644
--- a/lib/Target/X86/X86InstrVecCompiler.td
+++ b/lib/Target/X86/X86InstrVecCompiler.td
@@ -1,9 +1,8 @@
//===- X86InstrVecCompiler.td - Vector Compiler Patterns ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -99,76 +98,6 @@ defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>;
-multiclass subvector_store_lowering<string AlignedStr, string UnalignedStr,
- RegisterClass RC, ValueType DstTy,
- ValueType SrcTy, SubRegIndex SubIdx> {
- def : Pat<(alignedstore (DstTy (extract_subvector
- (SrcTy RC:$src), (iPTR 0))), addr:$dst),
- (!cast<Instruction>("VMOV"#AlignedStr#"mr") addr:$dst,
- (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
-
- def : Pat<(store (DstTy (extract_subvector
- (SrcTy RC:$src), (iPTR 0))), addr:$dst),
- (!cast<Instruction>("VMOV"#UnalignedStr#"mr") addr:$dst,
- (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
- defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64, sub_xmm>;
- defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>;
- defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64, sub_xmm>;
- defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32, sub_xmm>;
- defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>;
- defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8, sub_xmm>;
-}
-
-let Predicates = [HasVLX] in {
- // Special patterns for storing subvector extracts of lower 128-bits
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64,
- sub_xmm>;
- defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32,
- sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64,
- v4i64, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v4i32,
- v8i32, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v8i16,
- v16i16, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v16i8,
- v32i8, sub_xmm>;
-
- // Special patterns for storing subvector extracts of lower 128-bits of 512.
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64,
- sub_xmm>;
- defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32,
- sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64,
- v8i64, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v4i32,
- v16i32, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v8i16,
- v32i16, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v16i8,
- v64i8, sub_xmm>;
-
- // Special patterns for storing subvector extracts of lower 256-bits of 512.
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64,
- sub_ymm>;
- defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32,
- sub_ymm>;
- defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64,
- v8i64, sub_ymm>;
- defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v8i32,
- v16i32, sub_ymm>;
- defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v16i16,
- v32i16, sub_ymm>;
- defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v32i8,
- v64i8, sub_ymm>;
-}
-
// If we're inserting into an all zeros vector, just use a plain move which
// will zero the upper bits. A post-isel hook will take care of removing
// any moves that we can prove are unnecessary.
@@ -176,7 +105,7 @@ multiclass subvec_zero_lowering<string MoveStr,
RegisterClass RC, ValueType DstTy,
ValueType SrcTy, ValueType ZeroTy,
SubRegIndex SubIdx> {
- def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
+ def : Pat<(DstTy (insert_subvector immAllZerosV,
(SrcTy RC:$src), (iPTR 0))),
(SUBREG_TO_REG (i64 0),
(SrcTy (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src)), SubIdx)>;
@@ -398,7 +327,7 @@ let Predicates = [HasBWI, HasDQI] in {
(COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK64)>;
}
-let Predicates = [HasBWI, HasVLX] in {
+let Predicates = [HasBWI] in {
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
(v1i1 VK1:$mask), (iPTR 0))),
(KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK1:$mask, VK32),
@@ -487,7 +416,7 @@ def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
(XORPSrr VR128:$src1, VR128:$src2)>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
def : Pat<(f128 (X86fand VR128:$src1, (loadf128 addr:$src2))),
(VANDPSrm VR128:$src1, f128mem:$src2)>;
@@ -507,3 +436,24 @@ def : Pat<(f128 (X86fxor VR128:$src1, (loadf128 addr:$src2))),
def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
(VXORPSrr VR128:$src1, VR128:$src2)>;
}
+
+let Predicates = [HasVLX] in {
+// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
+def : Pat<(f128 (X86fand VR128X:$src1, (loadf128 addr:$src2))),
+ (VANDPSZ128rm VR128X:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fand VR128X:$src1, VR128X:$src2)),
+ (VANDPSZ128rr VR128X:$src1, VR128X:$src2)>;
+
+def : Pat<(f128 (X86for VR128X:$src1, (loadf128 addr:$src2))),
+ (VORPSZ128rm VR128X:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86for VR128X:$src1, VR128X:$src2)),
+ (VORPSZ128rr VR128X:$src1, VR128X:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128X:$src1, (loadf128 addr:$src2))),
+ (VXORPSZ128rm VR128X:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128X:$src1, VR128X:$src2)),
+ (VXORPSZ128rr VR128X:$src1, VR128X:$src2)>;
+}
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 9d810a675e3b..66ca78556b82 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -1,9 +1,8 @@
//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -247,36 +246,22 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
let ExeDomain = SSEPackedInt in { // SSE integer instructions
let isCommutable = 1 in
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, XOPCC:$cc),
- !strconcat("vpcom${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (ins VR128:$src1, VR128:$src2, u8imm:$cc),
+ !strconcat("vpcom", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
imm:$cc)))]>,
XOP_4V, Sched<[sched]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
- !strconcat("vpcom${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$cc),
+ !strconcat("vpcom", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (load addr:$src2)),
imm:$cc)))]>,
XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- !strconcat("vpcom", Suffix,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- !strconcat("vpcom", Suffix,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- }
}
def : Pat<(OpNode (load addr:$src2),
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index c20336387b2d..892a083f4d1a 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -1,9 +1,8 @@
//===- X86InstructionSelector.cpp -----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -419,18 +418,22 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
if (X86::GPRRegBankID == RB.getID())
return Isload ? X86::MOV32rm : X86::MOV32mr;
if (X86::VECRRegBankID == RB.getID())
- return Isload ? (HasAVX512 ? X86::VMOVSSZrm
- : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm)
- : (HasAVX512 ? X86::VMOVSSZmr
- : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+ return Isload ? (HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt)
+ : (HasAVX512 ? X86::VMOVSSZmr :
+ HasAVX ? X86::VMOVSSmr :
+ X86::MOVSSmr);
} else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) {
if (X86::GPRRegBankID == RB.getID())
return Isload ? X86::MOV64rm : X86::MOV64mr;
if (X86::VECRRegBankID == RB.getID())
- return Isload ? (HasAVX512 ? X86::VMOVSDZrm
- : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm)
- : (HasAVX512 ? X86::VMOVSDZmr
- : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+ return Isload ? (HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt)
+ : (HasAVX512 ? X86::VMOVSDZmr :
+ HasAVX ? X86::VMOVSDmr :
+ X86::MOVSDmr);
} else if (Ty.isVector() && Ty.getSizeInBits() == 128) {
if (Alignment >= 16)
return Isload ? (HasVLX ? X86::VMOVAPSZ128rm
@@ -513,10 +516,22 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
LLT Ty = MRI.getType(DefReg);
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+ assert(I.hasOneMemOperand());
auto &MemOp = **I.memoperands_begin();
- if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
- LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
- return false;
+ if (MemOp.isAtomic()) {
+ // Note: for unordered operations, we rely on the fact that the appropriate MMO
+ // is already on the instruction we're mutating, and thus we don't need to
+ // make any changes. So long as we select an opcode which is capable of
+ // loading or storing the appropriate size atomically, the rest of the
+ // backend is required to respect the MMO state.
+ if (!MemOp.isUnordered()) {
+ LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n");
+ return false;
+ }
+ if (MemOp.getAlignment() < Ty.getSizeInBits()/8) {
+ LLVM_DEBUG(dbgs() << "Unaligned atomics not supported yet\n");
+ return false;
+ }
}
unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
@@ -936,7 +951,6 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I,
bool SwapArgs;
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(
(CmpInst::Predicate)I.getOperand(1).getPredicate());
- unsigned OpSet = X86::getSETFromCond(CC);
unsigned LHS = I.getOperand(2).getReg();
unsigned RHS = I.getOperand(3).getReg();
@@ -970,7 +984,7 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I,
.addReg(RHS);
MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(OpSet), I.getOperand(0).getReg());
+ TII.get(X86::SETCCr), I.getOperand(0).getReg()).addImm(CC);
constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI);
@@ -991,8 +1005,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
static const uint16_t SETFOpcTable[2][3] = {
- {X86::SETEr, X86::SETNPr, X86::AND8rr},
- {X86::SETNEr, X86::SETPr, X86::OR8rr}};
+ {X86::COND_E, X86::COND_NP, X86::AND8rr},
+ {X86::COND_NE, X86::COND_P, X86::OR8rr}};
const uint16_t *SETFOpc = nullptr;
switch (Predicate) {
default:
@@ -1032,9 +1046,9 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
unsigned FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass);
unsigned FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass);
MachineInstr &Set1 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(SETFOpc[0]), FlagReg1);
+ TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]);
MachineInstr &Set2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(SETFOpc[1]), FlagReg2);
+ TII.get(X86::SETCCr), FlagReg2).addImm(SETFOpc[1]);
MachineInstr &Set3 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(SETFOpc[2]), ResultReg)
.addReg(FlagReg1)
@@ -1052,7 +1066,6 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
bool SwapArgs;
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
- unsigned Opc = X86::getSETFromCond(CC);
if (SwapArgs)
std::swap(LhsReg, RhsReg);
@@ -1064,7 +1077,7 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
.addReg(RhsReg);
MachineInstr &Set =
- *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc), ResultReg);
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), ResultReg).addImm(CC);
constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
constrainSelectedInstRegOperands(Set, TII, TRI, RBI);
I.eraseFromParent();
@@ -1409,8 +1422,8 @@ bool X86InstructionSelector::selectCondBranch(MachineInstr &I,
*BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::TEST8ri))
.addReg(CondReg)
.addImm(1);
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JNE_1))
- .addMBB(DestMBB);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JCC_1))
+ .addMBB(DestMBB).addImm(X86::COND_NE);
constrainSelectedInstRegOperands(TestInst, TII, TRI, RBI);
@@ -1530,15 +1543,14 @@ bool X86InstructionSelector::selectShift(MachineInstr &I,
const static struct ShiftEntry {
unsigned SizeInBits;
- unsigned CReg;
unsigned OpLSHR;
unsigned OpASHR;
unsigned OpSHL;
} OpTable[] = {
- {8, X86::CL, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8
- {16, X86::CX, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16
- {32, X86::ECX, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32
- {64, X86::RCX, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64
+ {8, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8
+ {16, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16
+ {32, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32
+ {64, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64
};
if (DstRB.getID() != X86::GPRRegBankID)
@@ -1551,7 +1563,6 @@ bool X86InstructionSelector::selectShift(MachineInstr &I,
if (ShiftEntryIt == std::end(OpTable))
return false;
- unsigned CReg = ShiftEntryIt->CReg;
unsigned Opcode = 0;
switch (I.getOpcode()) {
case TargetOpcode::G_SHL:
@@ -1570,16 +1581,11 @@ bool X86InstructionSelector::selectShift(MachineInstr &I,
unsigned Op0Reg = I.getOperand(1).getReg();
unsigned Op1Reg = I.getOperand(2).getReg();
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
- ShiftEntryIt->CReg)
- .addReg(Op1Reg);
+ assert(MRI.getType(Op1Reg).getSizeInBits() == 8);
- // The shift instruction uses X86::CL. If we defined a super-register
- // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
- if (CReg != X86::CL)
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::KILL),
- X86::CL)
- .addReg(CReg, RegState::Kill);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
+ X86::CL)
+ .addReg(Op1Reg);
MachineInstr &ShiftInst =
*BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg)
@@ -1608,8 +1614,8 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
assert(RegTy == MRI.getType(Op1Reg) && RegTy == MRI.getType(Op2Reg) &&
"Arguments and return value types must match");
- const RegisterBank &RegRB = *RBI.getRegBank(DstReg, MRI, TRI);
- if (RegRB.getID() != X86::GPRRegBankID)
+ const RegisterBank *RegRB = RBI.getRegBank(DstReg, MRI, TRI);
+ if (!RegRB || RegRB->getID() != X86::GPRRegBankID)
return false;
const static unsigned NumTypes = 4; // i8, i16, i32, i64
@@ -1707,7 +1713,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
const DivRemEntry &TypeEntry = *OpEntryIt;
const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
- const TargetRegisterClass *RegRC = getRegClass(RegTy, RegRB);
+ const TargetRegisterClass *RegRC = getRegClass(RegTy, *RegRB);
if (!RBI.constrainGenericRegister(Op1Reg, *RegRC, MRI) ||
!RBI.constrainGenericRegister(Op2Reg, *RegRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *RegRC, MRI)) {
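
The X86InstructionSelector hunks above fold the per-condition SETcc and Jcc opcodes into the generic X86::SETCCr and X86::JCC_1 instructions, with the condition code carried as a trailing immediate operand (e.g. .addImm(X86::COND_NE)). Below is a minimal, self-contained C++ sketch of that encoding shift; it deliberately does not use the LLVM MachineInstrBuilder API, and the CondCode values, the Inst struct, and the helper names are invented purely for illustration.

// Illustrative only: one generic opcode plus a condition-code immediate
// versus a distinct opcode per condition.
#include <cstdio>
#include <vector>

enum CondCode { COND_E, COND_NE, COND_L, COND_GE, LAST_VALID_COND = COND_GE };

struct Inst {
  const char *Opcode;        // schematic opcode name, e.g. "SETCCr"
  std::vector<int> Operands; // registers/immediates, schematically
};

// Old style (schematic): a separate opcode for each condition.
Inst emitSetPerCondOpcode(CondCode CC, int DstReg) {
  static const char *SetOpc[] = {"SETEr", "SETNEr", "SETLr", "SETGEr"};
  return {SetOpc[CC], {DstReg}};
}

// New style (schematic): a single opcode, condition code as an immediate.
Inst emitSetCC(CondCode CC, int DstReg) {
  return {"SETCCr", {DstReg, static_cast<int>(CC)}};
}

int main() {
  Inst A = emitSetPerCondOpcode(COND_NE, /*DstReg=*/1);
  Inst B = emitSetCC(COND_NE, /*DstReg=*/1);
  std::printf("%s vs %s (cond imm = %d)\n", A.Opcode, B.Opcode, B.Operands[1]);
}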
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index 28940754a203..8f74a8fe041d 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -1,9 +1,8 @@
//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -194,7 +193,7 @@ void X86InterleavedAccessGroup::decompose(
// Decompose the load instruction.
LoadInst *LI = cast<LoadInst>(VecInst);
- Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
+ Type *VecBaseTy, *VecBasePtrTy;
Value *VecBasePtr;
unsigned int NumLoads = NumSubVectors;
// In the case of stride 3 with a vector of 32 elements load the information
@@ -202,18 +201,22 @@ void X86InterleavedAccessGroup::decompose(
// [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
if (VecLength == 768 || VecLength == 1536) {
- Type *VecTran =
- VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
- VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
+ VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
+ VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
+ VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
NumLoads = NumSubVectors * (VecLength / 384);
- } else
+ } else {
+ VecBaseTy = SubVecTy;
+ VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
+ }
// Generate N loads of T type.
for (unsigned i = 0; i < NumLoads; i++) {
// TODO: Support inbounds GEP.
- Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
+ Value *NewBasePtr =
+ Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =
- Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
+ Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment());
DecomposedVectors.push_back(NewLoad);
}
}
@@ -416,7 +419,7 @@ void X86InterleavedAccessGroup::interleave8bitStride4(
}
reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
- NumOfElm, 4, Builder);
+ NumOfElm, 4, Builder);
}
// createShuffleStride returns shuffle mask of size N.
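
The decompose() hunk above now tracks the loaded element type (VecBaseTy) explicitly and passes it to CreateGEP and CreateAlignedLoad while splitting one wide interleaved load into NumLoads smaller loads at base + i * sub-vector. The following standalone sketch mirrors that indexing on a plain array under the same assumption, without the LLVM IRBuilder; the decompose and SubVec names here are hypothetical.

// Illustrative only: split a wide contiguous buffer into NumLoads
// sub-vectors of SubVecElems elements each, advancing by whole sub-vectors
// (the plain-memory analogue of CreateGEP(VecBaseTy, VecBasePtr, i)).
#include <cstdio>
#include <vector>

using SubVec = std::vector<float>;

std::vector<SubVec> decompose(const float *Base, unsigned NumLoads,
                              unsigned SubVecElems) {
  std::vector<SubVec> Out;
  for (unsigned i = 0; i < NumLoads; ++i) {
    const float *Ptr = Base + i * SubVecElems; // i-th sub-vector base
    Out.emplace_back(Ptr, Ptr + SubVecElems);  // "load" SubVecElems elements
  }
  return Out;
}

int main() {
  float Wide[16];
  for (int i = 0; i < 16; ++i)
    Wide[i] = float(i);
  std::vector<SubVec> Parts = decompose(Wide, /*NumLoads=*/4, /*SubVecElems=*/4);
  std::printf("parts=%zu first=%g last=%g\n", Parts.size(),
              Parts.front().front(), Parts.back().back());
}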
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 151e1b9136c4..40141d894629 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1,9 +1,8 @@
//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -20,21 +19,22 @@
namespace llvm {
enum IntrinsicType : uint16_t {
+ CVTNEPS2BF16_MASK,
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
INTR_TYPE_3OP_IMM8,
- CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
- CVTPD2PS, CVTPD2PS_MASK, CVTPD2PS_RND_MASK,
- INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
- INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
- INTR_TYPE_3OP_MASK,
- IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK,
- INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
+ CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV,
+ CVTPD2PS_MASK,
+ INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE,
+ INTR_TYPE_1OP_MASK_SAE, INTR_TYPE_2OP_MASK_SAE, INTR_TYPE_3OP_MASK_SAE,
+ INTR_TYPE_1OP_MASK, INTR_TYPE_2OP_MASK,
+ IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK, INTR_TYPE_SCALAR_MASK_SAE,
+ INTR_TYPE_SCALAR_MASK_RND,
+ INTR_TYPE_3OP_SCALAR_MASK_SAE,
COMPRESS_EXPAND_IN_REG,
- TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2I_MASK,
+ TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2DQ_MASK, CVTQQ2PS_MASK,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
- FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
- FIXUPIMMS_MASKZ, GATHER_AVX2,
+ FIXUPIMM, FIXUPIMM_MASKZ, GATHER_AVX2,
ROUNDP, ROUNDS
};
@@ -64,47 +64,47 @@ struct IntrinsicData {
* the alphabetical order.
*/
static const IntrinsicData IntrinsicsWithChain[] = {
- X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, 0, 0),
- X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
@@ -115,30 +115,30 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
- X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),
@@ -249,47 +249,47 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNCUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, 0, 0),
- X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm,
X86::VSCATTERPF1DPDm),
X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm,
@@ -298,24 +298,24 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86::VSCATTERPF1QPDm),
X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm,
X86::VSCATTERPF1QPSm),
- X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
- X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0),
X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0),
X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0),
X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0),
X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0),
- X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0),
- X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0),
- X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0),
+ X86_INTRINSIC_DATA(rdtsc, RDTSC, X86::RDTSC, 0),
+ X86_INTRINSIC_DATA(rdtscp, RDTSC, X86::RDTSCP, 0),
+ X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0),
X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0),
};
@@ -340,9 +340,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD),
X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+ X86_INTRINSIC_DATA(avx_blendv_pd_256, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(avx_blendv_ps_256, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
- X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
+ X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx_cvt_ps2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
@@ -369,6 +371,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
@@ -389,10 +394,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
@@ -405,39 +410,45 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND),
+ X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE),
X86_INTRINSIC_DATA(avx512_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND),
- X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND),
- X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND),
- X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND),
- X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND),
- X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND),
- X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND),
- X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND),
- X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND),
- X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE),
+ X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_d_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_d_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_q_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_q_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_q_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_dbpsadbw_128, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_dbpsadbw_256, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_dbpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
- X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
- X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+ X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE),
+ X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE),
X86_INTRINSIC_DATA(avx512_fpclass_pd_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_fpclass_pd_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_fpclass_pd_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
@@ -448,80 +459,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_kadd_d, INTR_TYPE_2OP, X86ISD::KADD, 0),
X86_INTRINSIC_DATA(avx512_kadd_q, INTR_TYPE_2OP, X86ISD::KADD, 0),
X86_INTRINSIC_DATA(avx512_kadd_w, INTR_TYPE_2OP, X86ISD::KADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FADDS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FADDS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FADDS, X86ISD::FADDS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FADDS, X86ISD::FADDS_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
- X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
+ X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
- X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
+ X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
- X86_INTRINSIC_DATA(avx512_mask_compress_b_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_b_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_b_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_w_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_w_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_w_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, CVTPD2I_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, CVTPD2DQ_MASK,
X86ISD::CVTP2SI, X86ISD::MCVTP2SI),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, CVTPD2PS_MASK,
X86ISD::VFPROUND, X86ISD::VMFPROUND),
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_RND_MASK,
- ISD::FP_ROUND, X86ISD::VFPROUND_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPROUND, X86ISD::VFPROUND_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, CVTPD2I_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, CVTPD2DQ_MASK,
X86ISD::CVTP2UI, X86ISD::MCVTP2UI),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2UI, 0),
@@ -539,8 +502,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK,
- ISD::FP_EXTEND, X86ISD::VFPEXT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK_SAE,
+ ISD::FP_EXTEND, X86ISD::VFPEXT_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK,
@@ -559,164 +522,116 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK,
- X86ISD::CVTSI2P, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::VFPROUNDS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::VFPEXTS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, CVTPD2I_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, CVTQQ2PS_MASK,
+ X86ISD::CVTSI2P, X86ISD::MCVTSI2P),
+ X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RND,
+ X86ISD::VFPROUNDS, X86ISD::VFPROUNDS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::VFPEXTS, X86ISD::VFPEXTS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, CVTPD2DQ_MASK,
X86ISD::CVTTP2SI, X86ISD::MCVTTP2SI),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, CVTPD2I_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, CVTPD2DQ_MASK,
X86ISD::CVTTP2UI, X86ISD::MCVTTP2UI),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK,
- X86ISD::CVTUI2P, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FDIVS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FDIVS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_b_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_b_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_b_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_w_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_w_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_w_512, COMPRESS_EXPAND_IN_REG,
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, CVTQQ2PS_MASK,
+ X86ISD::CVTUI2P, X86ISD::MCVTUI2P),
+ X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FDIVS, X86ISD::FDIVS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FDIVS, X86ISD::FDIVS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_expand, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
- X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
- X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
- X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMM, X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMM, X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0),
X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FGETEXPS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FGETEXPS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::FGETEXP, X86ISD::FGETEXP_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::FGETEXP, X86ISD::FGETEXP_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_SAE,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_SAE,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK,
- X86ISD::VGETMANT, X86ISD::VGETMANT_RND),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, X86ISD::VGETMANT_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_SAE,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_SAE,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK,
- X86ISD::VGETMANT, X86ISD::VGETMANT_RND),
- X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK,
- X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK,
- X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK,
- X86ISD::FMAXS, X86ISD::FMAXS_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK,
- X86ISD::FMAXS, X86ISD::FMAXS_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK,
- X86ISD::FMINS, X86ISD::FMINS_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK,
- X86ISD::FMINS, X86ISD::FMINS_RND),
- X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMULS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMULS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, X86ISD::VGETMANT_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_SAE,
+ X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_SAE,
+ X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMAXS, X86ISD::FMAXS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMAXS, X86ISD::FMAXS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMINS, X86ISD::FMINS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMINS, X86ISD::FMINS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMULS, X86ISD::FMULS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMULS, X86ISD::FMULS_RND),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, TRUNCATE_TO_REG,
X86ISD::VTRUNC, X86ISD::VMTRUNC),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, TRUNCATE_TO_REG,
@@ -737,10 +652,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VTRUNC, X86ISD::VMTRUNC),
X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, TRUNCATE_TO_REG,
X86ISD::VTRUNC, X86ISD::VMTRUNC),
- X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK,
- ISD::TRUNCATE, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK,
- ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, TRUNCATE_TO_REG,
X86ISD::VTRUNC, X86ISD::VMTRUNC),
X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, TRUNCATE_TO_REG,
@@ -749,10 +660,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
ISD::TRUNCATE, X86ISD::VMTRUNC),
X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, TRUNCATE_TO_REG,
X86ISD::VTRUNC, X86ISD::VMTRUNC),
- X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK,
- ISD::TRUNCATE, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK,
- ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, TRUNCATE_TO_REG,
X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, TRUNCATE_TO_REG,
@@ -825,62 +732,62 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VTRUNCUS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
- X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
- X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND),
- X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND),
- X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND),
- X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, X86ISD::VRANGE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, X86ISD::VRANGE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE),
X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK,
- X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND),
+ X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE),
X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK,
- X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND),
- X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK,
X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK,
X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, X86ISD::SCALEF_RND),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK,
X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK,
X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM,
- X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM,
- X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::SCALEFS, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::SCALEFS, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSQRTS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSQRTS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSUBS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSUBS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, X86ISD::SCALEF_RND),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::SCALEFS, X86ISD::SCALEFS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::SCALEFS, X86ISD::SCALEFS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSQRTS, X86ISD::FSQRTS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSQRTS, X86ISD::FSQRTS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSUBS, X86ISD::FSUBS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSUBS, X86ISD::FSUBS_RND),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_SAE),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK,
X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, CVTPS2PH_MASK,
@@ -893,28 +800,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_512, FIXUPIMM_MASKZ,
- X86ISD::VFIXUPIMM, 0),
+ X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_128, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_256, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_512, FIXUPIMM_MASKZ,
- X86ISD::VFIXUPIMM, 0),
- X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMMS_MASKZ,
- X86ISD::VFIXUPIMMS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ,
- X86ISD::VFIXUPIMMS, 0),
+ X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
- X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE),
+ X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE),
+ X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE),
+ X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE),
X86_INTRINSIC_DATA(avx512_mul_pd_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mul_ps_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -943,11 +852,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_pslli_d_512, VSHIFT, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx512_pslli_q_512, VSHIFT, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx512_pslli_w_512, VSHIFT, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
X86_INTRINSIC_DATA(avx512_psra_d_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx512_psra_q_128, INTR_TYPE_2OP, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx512_psra_q_256, INTR_TYPE_2OP, X86ISD::VSRA, 0),
@@ -971,11 +880,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_psrli_d_512, VSHIFT, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_psrli_q_512, VSHIFT, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_psrli_w_512, VSHIFT, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
@@ -990,10 +899,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE),
+ X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE),
+ X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE),
+ X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
@@ -1002,14 +911,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE),
+ X86_INTRINSIC_DATA(avx512_sitofp_round, INTR_TYPE_1OP, ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_sub_pd_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_sub_ps_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_uitofp_round, INTR_TYPE_1OP, ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND),
@@ -1071,6 +982,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_vpshufbitqmb_128, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
X86_INTRINSIC_DATA(avx512_vpshufbitqmb_256, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
X86_INTRINSIC_DATA(avx512_vpshufbitqmb_512, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
+ // bfloat16
+ X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_128, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_256, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_512, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_256, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_512, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_128, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
+ X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_256, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
+ X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_512, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
+ X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16),
X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
@@ -1111,6 +1032,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_cvtps2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvtsd2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
X86_INTRINSIC_DATA(sse2_cvtsd2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvtsd2ss, INTR_TYPE_2OP, X86ISD::VFPROUNDS, 0),
X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvttsd2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
@@ -1123,6 +1045,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
@@ -1156,8 +1080,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(sse41_blendvpd, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(sse41_blendvps, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse41_pblendvb, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
@@ -1200,14 +1127,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP,
X86ISD::GF2P8MULB, 0),
- X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
- X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
- X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
- X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
- X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
- X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
- X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
- X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 4a49fa68dd06..00fb1b573858 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -1,9 +1,8 @@
//===- X86LegalizerInfo.cpp --------------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -134,9 +133,15 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
// Shifts and SDIV
getActionDefinitionsBuilder(
- {G_SHL, G_LSHR, G_ASHR, G_SDIV, G_SREM, G_UDIV, G_UREM})
- .legalFor({s8, s16, s32})
- .clampScalar(0, s8, s32);
+ {G_SDIV, G_SREM, G_UDIV, G_UREM})
+ .legalFor({s8, s16, s32})
+ .clampScalar(0, s8, s32);
+
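+  // A sketch of the intent: the shift amount (type index 1) now carries its
+  // own type and is clamped to s8, matching the 8-bit CL count register used
+  // by x86 variable shifts.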
+ getActionDefinitionsBuilder(
+ {G_SHL, G_LSHR, G_ASHR})
+ .legalFor({{s8, s8}, {s16, s8}, {s32, s8}})
+ .clampScalar(0, s8, s32)
+ .clampScalar(1, s8, s8);
}
// Control-flow
@@ -236,12 +241,19 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
.clampScalar(1, s32, s64)
.widenScalarToNextPow2(1);
- // Shifts and SDIV
+ // Divisions
getActionDefinitionsBuilder(
- {G_SHL, G_LSHR, G_ASHR, G_SDIV, G_SREM, G_UDIV, G_UREM})
+ {G_SDIV, G_SREM, G_UDIV, G_UREM})
.legalFor({s8, s16, s32, s64})
.clampScalar(0, s8, s64);
+ // Shifts
+ getActionDefinitionsBuilder(
+ {G_SHL, G_LSHR, G_ASHR})
+ .legalFor({{s8, s8}, {s16, s8}, {s32, s8}, {s64, s8}})
+ .clampScalar(0, s8, s64)
+ .clampScalar(1, s8, s8);
+
// Merge/Unmerge
setAction({G_MERGE_VALUES, s128}, Legal);
setAction({G_UNMERGE_VALUES, 1, s128}, Legal);
diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h
index 135950a95f84..d21707b9ab9b 100644
--- a/lib/Target/X86/X86LegalizerInfo.h
+++ b/lib/Target/X86/X86LegalizerInfo.h
@@ -1,10 +1,9 @@
//===- X86LegalizerInfo.h ------------------------------------------*- C++
//-*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 2816f8c62bfb..b1fefaa84be4 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1,9 +1,8 @@
//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/X86ATTInstPrinter.h"
-#include "InstPrinter/X86InstComments.h"
+#include "MCTargetDesc/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86InstComments.h"
#include "MCTargetDesc/X86TargetStreamer.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86AsmPrinter.h"
@@ -101,9 +100,7 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
}
void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
- OutStreamer->EmitInstruction(Inst, getSubtargetInfo(),
- EnablePrintSchedInfo &&
- !(Inst.getFlags() & X86::NO_SCHED_INFO));
+ OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
}
@@ -438,7 +435,6 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.addOperand(MaybeMCOp.getValue());
// Handle a few special cases to eliminate operand modifiers.
-ReSimplify:
switch (OutMI.getOpcode()) {
case X86::LEA64_32r:
case X86::LEA64r:
@@ -554,11 +550,6 @@ ReSimplify:
case X86::TAILJMPd64:
Opcode = X86::JMP_1;
goto SetTailJmpOpcode;
- case X86::TAILJMPd_CC:
- case X86::TAILJMPd64_CC:
- Opcode = X86::GetCondBranchFromCond(
- static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
- goto SetTailJmpOpcode;
SetTailJmpOpcode:
MCOperand Saved = OutMI.getOperand(0);
@@ -568,6 +559,17 @@ ReSimplify:
break;
}
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC: {
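+    // Conditional tail jumps now lower directly to JCC_1, keeping the branch
+    // target (operand 0) and the condition-code immediate (operand 1).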
+ MCOperand Saved = OutMI.getOperand(0);
+ MCOperand Saved2 = OutMI.getOperand(1);
+ OutMI = MCInst();
+ OutMI.setOpcode(X86::JCC_1);
+ OutMI.addOperand(Saved);
+ OutMI.addOperand(Saved2);
+ break;
+ }
+
case X86::DEC16r:
case X86::DEC32r:
case X86::INC16r:
@@ -586,19 +588,6 @@ ReSimplify:
}
break;
- // These are pseudo-ops for OR to help with the OR->ADD transformation. We do
- // this with an ugly goto in case the resultant OR uses EAX and needs the
- // short form.
- case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
- case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
- case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
- case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
- case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
- case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
- case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
- case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
- case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
-
// We don't currently select the correct instruction form for instructions
// which have a short %eax, etc. form. Handle this by custom lowering, for
// now.
@@ -694,16 +683,9 @@ ReSimplify:
void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
const MachineInstr &MI) {
-
- bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
+ bool Is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
MI.getOpcode() == X86::TLS_base_addr64;
-
- bool needsPadding = MI.getOpcode() == X86::TLS_addr64;
-
- MCContext &context = OutStreamer->getContext();
-
- if (needsPadding)
- EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ MCContext &Ctx = OutStreamer->getContext();
MCSymbolRefExpr::VariantKind SRVK;
switch (MI.getOpcode()) {
@@ -721,51 +703,86 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
llvm_unreachable("unexpected opcode");
}
- MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
- const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context);
-
- MCInst LEA;
- if (is64Bits) {
- LEA.setOpcode(X86::LEA64r);
- LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest
- LEA.addOperand(MCOperand::createReg(X86::RIP)); // base
- LEA.addOperand(MCOperand::createImm(1)); // scale
- LEA.addOperand(MCOperand::createReg(0)); // index
- LEA.addOperand(MCOperand::createExpr(symRef)); // disp
- LEA.addOperand(MCOperand::createReg(0)); // seg
- } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
- LEA.setOpcode(X86::LEA32r);
- LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
- LEA.addOperand(MCOperand::createReg(X86::EBX)); // base
- LEA.addOperand(MCOperand::createImm(1)); // scale
- LEA.addOperand(MCOperand::createReg(0)); // index
- LEA.addOperand(MCOperand::createExpr(symRef)); // disp
- LEA.addOperand(MCOperand::createReg(0)); // seg
+ const MCSymbolRefExpr *Sym = MCSymbolRefExpr::create(
+ MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)), SRVK, Ctx);
+
+  // As of binutils 2.32, ld reports a bogus TLS relaxation error when it tries
+  // to relax a GD/LD code sequence that uses R_X86_64_GOTPCREL (instead of
+  // R_X86_64_GOTPCRELX) to IE/LE (binutils PR24784). Work around the bug by
+  // only going through the GOT when GOTPCRELX is enabled.
+  // TODO: Delete the workaround when GOTPCRELX becomes commonplace.
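+  //
+  // For illustration only, the 64-bit general-dynamic sequences emitted below
+  // are roughly:
+  //   data16 leaq sym@tlsgd(%rip), %rdi
+  //   data16 data16 rex64 callq __tls_get_addr@PLT            (no GOT)
+  // versus
+  //   data16 leaq sym@tlsgd(%rip), %rdi
+  //   data16 rex64 callq *__tls_get_addr@GOTPCREL(%rip)       (through the GOT)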
+ bool UseGot = MMI->getModule()->getRtLibUseGOT() &&
+ Ctx.getAsmInfo()->canRelaxRelocations();
+
+ if (Is64Bits) {
+ bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD;
+ if (NeedsPadding)
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA64r)
+ .addReg(X86::RDI)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Sym)
+ .addReg(0));
+ const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("__tls_get_addr");
+ if (NeedsPadding) {
+ if (!UseGot)
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ }
+ if (UseGot) {
+ const MCExpr *Expr = MCSymbolRefExpr::create(
+ TlsGetAddr, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64m)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Expr)
+ .addReg(0));
+ } else {
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALL64pcrel32)
+ .addExpr(MCSymbolRefExpr::create(TlsGetAddr,
+ MCSymbolRefExpr::VK_PLT, Ctx)));
+ }
} else {
- LEA.setOpcode(X86::LEA32r);
- LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
- LEA.addOperand(MCOperand::createReg(0)); // base
- LEA.addOperand(MCOperand::createImm(1)); // scale
- LEA.addOperand(MCOperand::createReg(X86::EBX)); // index
- LEA.addOperand(MCOperand::createExpr(symRef)); // disp
- LEA.addOperand(MCOperand::createReg(0)); // seg
- }
- EmitAndCountInstruction(LEA);
+ if (SRVK == MCSymbolRefExpr::VK_TLSGD && !UseGot) {
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA32r)
+ .addReg(X86::EAX)
+ .addReg(0)
+ .addImm(1)
+ .addReg(X86::EBX)
+ .addExpr(Sym)
+ .addReg(0));
+ } else {
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA32r)
+ .addReg(X86::EAX)
+ .addReg(X86::EBX)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Sym)
+ .addReg(0));
+ }
- if (needsPadding) {
- EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
- EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
- EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("___tls_get_addr");
+ if (UseGot) {
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(TlsGetAddr, MCSymbolRefExpr::VK_GOT, Ctx);
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL32m)
+ .addReg(X86::EBX)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Expr)
+ .addReg(0));
+ } else {
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::create(TlsGetAddr,
+ MCSymbolRefExpr::VK_PLT, Ctx)));
+ }
}
-
- StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
- MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name);
- const MCSymbolRefExpr *tlsRef =
- MCSymbolRefExpr::create(tlsGetAddr, MCSymbolRefExpr::VK_PLT, context);
-
- EmitAndCountInstruction(
- MCInstBuilder(is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32)
- .addExpr(tlsRef));
}
/// Emit the largest nop instruction smaller than or equal to \p NumBytes
@@ -778,7 +795,7 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
unsigned NopSize;
unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
- Opc = IndexReg = Displacement = SegmentReg = 0;
+ IndexReg = Displacement = SegmentReg = 0;
BaseReg = X86::RAX;
ScaleVal = 1;
switch (NumBytes) {
@@ -963,6 +980,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I))
MI.addOperand(MaybeOperand.getValue());
+ OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
OutStreamer->EmitInstruction(MI, getSubtargetInfo());
}
@@ -1374,7 +1392,8 @@ PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
MBB = MBB->getPrevNode();
MBBI = MBB->end();
}
- return --MBBI;
+ --MBBI;
+ return MBBI;
}
static const Constant *getConstantFromPool(const MachineInstr &MI,
@@ -1668,6 +1687,77 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::TLS_base_addr64:
return LowerTlsAddr(MCInstLowering, *MI);
+  // Loading/storing mask pairs requires two kmov operations. The second one
+  // needs a 2-byte displacement relative to the specified address (the spill
+  // size is 32 bits). Pairs of masks from 1-bit up to 16-bit all use the same
+  // spill size, so they are all stored with MASKPAIR16STORE and loaded with
+  // MASKPAIR16LOAD.
+  //
+  // The displacement could in theory wrap around, hence the asserts in both
+  // cases.
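+  //
+  // For illustration, assuming the address operands resolve to Disp(%rsp),
+  // MASKPAIR16LOAD expands roughly to:
+  //   kmovw Disp(%rsp), %k0      ; sub_mask_0 of the pair
+  //   kmovw Disp+2(%rsp), %k1    ; sub_mask_1 of the pair
+  // and MASKPAIR16STORE is the mirror image using KMOVWmk.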
+ case X86::MASKPAIR16LOAD: {
+ int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm();
+ assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
+ const X86RegisterInfo *RI =
+ MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+ unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
+ unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
+
+ // Load the first mask register
+ MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm);
+ MIB.addReg(Reg0);
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i));
+ MIB.addOperand(Op.getValue());
+ }
+ EmitAndCountInstruction(MIB);
+
+ // Load the second mask register of the pair
+ MIB = MCInstBuilder(X86::KMOVWkm);
+ MIB.addReg(Reg1);
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp) {
+ MIB.addImm(Disp + 2);
+ } else {
+ auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i));
+ MIB.addOperand(Op.getValue());
+ }
+ }
+ EmitAndCountInstruction(MIB);
+ return;
+ }
+
+ case X86::MASKPAIR16STORE: {
+ int64_t Disp = MI->getOperand(X86::AddrDisp).getImm();
+ assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
+ const X86RegisterInfo *RI =
+ MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+ unsigned Reg = MI->getOperand(X86::AddrNumOperands).getReg();
+ unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
+ unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
+
+ // Store the first mask register
+ MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk);
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MCInstLowering.LowerMachineOperand(MI, MI->getOperand(i)).getValue());
+ MIB.addReg(Reg0);
+ EmitAndCountInstruction(MIB);
+
+ // Store the second mask register of the pair
+ MIB = MCInstBuilder(X86::KMOVWmk);
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp) {
+ MIB.addImm(Disp + 2);
+ } else {
+ auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(0 + i));
+ MIB.addOperand(Op.getValue());
+ }
+ }
+ MIB.addReg(Reg1);
+ EmitAndCountInstruction(MIB);
+ return;
+ }
+
case X86::MOVPC32r: {
// This is a pseudo op for a two instruction sequence with a label, which
// looks like:
@@ -1861,8 +1951,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 64> Mask;
DecodePSHUFBMask(C, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
- !EnablePrintSchedInfo);
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
@@ -1934,8 +2023,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPERMILPMask(C, ElSize, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
- !EnablePrintSchedInfo);
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
@@ -1966,8 +2054,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
- !EnablePrintSchedInfo);
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
@@ -1984,8 +2071,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPPERMMask(C, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
- !EnablePrintSchedInfo);
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
@@ -2002,7 +2088,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
if (auto *CF = dyn_cast<ConstantFP>(C)) {
CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false);
- OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ OutStreamer->AddComment(CS.str());
}
}
break;
@@ -2099,7 +2185,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
CS << "]";
- OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ OutStreamer->AddComment(CS.str());
} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
CS << "<";
for (int l = 0; l != NumLanes; ++l) {
@@ -2111,7 +2197,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
CS << ">";
- OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ OutStreamer->AddComment(CS.str());
}
}
break;
@@ -2198,14 +2284,12 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
printConstant(C, CS);
}
CS << "]";
- OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ OutStreamer->AddComment(CS.str());
}
}
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
- if (MI->getAsmPrinterFlag(MachineInstr::NoSchedComment))
- TmpInst.setFlags(TmpInst.getFlags() | X86::NO_SCHED_INFO);
// Stackmap shadows cannot include branch targets, so we can count the bytes
// in a call towards the shadow, but must ensure that the no thread returns
diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp
index 5433033671f3..05f846bfb219 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index e1183bd14796..d7e535598d81 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index 5c09597d0442..c6da4b09dd60 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -1,9 +1,8 @@
//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,59 +18,29 @@
using namespace llvm;
-/// Check if the instr pair, FirstMI and SecondMI, should be fused
-/// together. Given SecondMI, when FirstMI is unspecified, then check if
-/// SecondMI may be part of a fused pair at all.
-static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
- const TargetSubtargetInfo &TSI,
- const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
- const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI);
- // Check if this processor supports macro-fusion.
- if (!ST.hasMacroFusion())
- return false;
+namespace {
- enum {
- FuseTest,
- FuseCmp,
- FuseInc
- } FuseKind;
+// The classification for the first instruction.
+enum class FirstInstrKind { Test, Cmp, And, ALU, IncDec, Invalid };
- unsigned FirstOpcode = FirstMI
- ? FirstMI->getOpcode()
- : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
- unsigned SecondOpcode = SecondMI.getOpcode();
+// The classification for the second instruction (jump).
+enum class JumpKind {
+ // JE, JL, JG and variants.
+ ELG,
+ // JA, JB and variants.
+ AB,
+ // JS, JP, JO and variants.
+ SPO,
+ // Not a fusable jump.
+ Invalid,
+};
- switch (SecondOpcode) {
- default:
- return false;
- case X86::JE_1:
- case X86::JNE_1:
- case X86::JL_1:
- case X86::JLE_1:
- case X86::JG_1:
- case X86::JGE_1:
- FuseKind = FuseInc;
- break;
- case X86::JB_1:
- case X86::JBE_1:
- case X86::JA_1:
- case X86::JAE_1:
- FuseKind = FuseCmp;
- break;
- case X86::JS_1:
- case X86::JNS_1:
- case X86::JP_1:
- case X86::JNP_1:
- case X86::JO_1:
- case X86::JNO_1:
- FuseKind = FuseTest;
- break;
- }
+} // namespace
- switch (FirstOpcode) {
+static FirstInstrKind classifyFirst(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default:
- return false;
+ return FirstInstrKind::Invalid;
case X86::TEST8rr:
case X86::TEST16rr:
case X86::TEST32rr:
@@ -84,6 +53,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::TEST16mr:
case X86::TEST32mr:
case X86::TEST64mr:
+ return FirstInstrKind::Test;
case X86::AND16ri:
case X86::AND16ri8:
case X86::AND16rm:
@@ -99,7 +69,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::AND8ri:
case X86::AND8rm:
case X86::AND8rr:
- return true;
+ return FirstInstrKind::And;
case X86::CMP16ri:
case X86::CMP16ri8:
case X86::CMP16rm:
@@ -119,6 +89,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::CMP8rm:
case X86::CMP8rr:
case X86::CMP8mr:
+ return FirstInstrKind::Cmp;
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri8_DB:
@@ -141,8 +112,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::ADD64rr:
case X86::ADD64rr_DB:
case X86::ADD8ri:
+ case X86::ADD8ri_DB:
case X86::ADD8rm:
case X86::ADD8rr:
+ case X86::ADD8rr_DB:
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB16rm:
@@ -158,7 +131,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::SUB8ri:
case X86::SUB8rm:
case X86::SUB8rr:
- return FuseKind == FuseCmp || FuseKind == FuseInc;
+ return FirstInstrKind::ALU;
case X86::INC16r:
case X86::INC32r:
case X86::INC64r:
@@ -167,10 +140,87 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::DEC32r:
case X86::DEC64r:
case X86::DEC8r:
- return FuseKind == FuseInc;
- case X86::INSTRUCTION_LIST_END:
- return true;
+ return FirstInstrKind::IncDec;
+ }
+}
+
+static JumpKind classifySecond(const MachineInstr &MI) {
+ X86::CondCode CC = X86::getCondFromBranch(MI);
+ if (CC == X86::COND_INVALID)
+ return JumpKind::Invalid;
+
+ switch (CC) {
+ default:
+ return JumpKind::Invalid;
+ case X86::COND_E:
+ case X86::COND_NE:
+ case X86::COND_L:
+ case X86::COND_LE:
+ case X86::COND_G:
+ case X86::COND_GE:
+ return JumpKind::ELG;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_A:
+ case X86::COND_AE:
+ return JumpKind::AB;
+ case X86::COND_S:
+ case X86::COND_NS:
+ case X86::COND_P:
+ case X86::COND_NP:
+ case X86::COND_O:
+ case X86::COND_NO:
+ return JumpKind::SPO;
+ }
+}
+
+/// Check whether the instruction pair, FirstMI and SecondMI, should be fused
+/// together. If FirstMI is unspecified, check whether SecondMI may be part of
+/// a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI);
+
+ // Check if this processor supports any kind of fusion.
+ if (!(ST.hasBranchFusion() || ST.hasMacroFusion()))
+ return false;
+
+ const JumpKind BranchKind = classifySecond(SecondMI);
+
+ if (BranchKind == JumpKind::Invalid)
+ return false; // Second cannot be fused with anything.
+
+ if (FirstMI == nullptr)
+ return true; // We're only checking whether Second can be fused at all.
+
+ const FirstInstrKind TestKind = classifyFirst(*FirstMI);
+
+ if (ST.hasBranchFusion()) {
+ // Branch fusion can merge CMP and TEST with all conditional jumps.
+ return (TestKind == FirstInstrKind::Cmp ||
+ TestKind == FirstInstrKind::Test);
+ }
+
+ if (ST.hasMacroFusion()) {
+ // Macro Fusion rules are a bit more complex. See Agner Fog's
+ // Microarchitecture table 9.2 "Instruction Fusion".
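+    // For example, under these rules TEST/AND fuse with any conditional jump,
+    // CMP/ADD/SUB fuse only with the ELG and AB groups (e.g. CMP + JA), and
+    // INC/DEC fuse only with the ELG group (so DEC + JB is not fused).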
+ switch (TestKind) {
+ case FirstInstrKind::Test:
+ case FirstInstrKind::And:
+ return true;
+ case FirstInstrKind::Cmp:
+ case FirstInstrKind::ALU:
+ return BranchKind == JumpKind::ELG || BranchKind == JumpKind::AB;
+ case FirstInstrKind::IncDec:
+ return BranchKind == JumpKind::ELG;
+ case FirstInstrKind::Invalid:
+ return false;
+ }
}
+
+ llvm_unreachable("unknown branch fusion type");
}
namespace llvm {
diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h
index 97ef1d6d3b61..d4ae54f657a5 100644
--- a/lib/Target/X86/X86MacroFusion.h
+++ b/lib/Target/X86/X86MacroFusion.h
@@ -1,9 +1,8 @@
//===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index b56d02b6bfb6..7f75598b0655 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -1,9 +1,8 @@
//===- X86OptimizeLEAs.cpp - optimize usage of LEA instructions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -569,11 +568,8 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
unsigned VReg,
int64_t AddrDispShift) {
DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression());
-
if (AddrDispShift != 0)
- Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, AddrDispShift,
- DIExpression::NoDeref,
- DIExpression::WithStackValue);
+ Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift);
// Replace DBG_VALUE instruction with modified version.
MachineBasicBlock *MBB = MI.getParent();
@@ -701,7 +697,7 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
// Remove redundant address calculations. Do it only for -Os/-Oz since only
// a code size gain is expected from this part of the pass.
- if (MF.getFunction().optForSize())
+ if (MF.getFunction().hasOptSize())
Changed |= removeRedundantAddrCalc(LEAs);
}
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 85b9aecc2106..af974c805c36 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -1,9 +1,8 @@
//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -98,7 +97,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- if (MF.getFunction().optForSize())
+ if (MF.getFunction().hasOptSize())
return false;
if (!MF.getSubtarget<X86Subtarget>().padShortFunctions())
@@ -113,14 +112,11 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
bool MadeChange = false;
- MachineBasicBlock *MBB;
- unsigned int Cycles = 0;
-
// Pad the identified basic blocks with NOOPs
for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
I != ReturnBBs.end(); ++I) {
- MBB = I->first;
- Cycles = I->second;
+ MachineBasicBlock *MBB = I->first;
+ unsigned Cycles = I->second;
if (Cycles < Threshold) {
// BB ends in a return. Skip over any DBG_VALUE instructions
diff --git a/lib/Target/X86/X86PfmCounters.td b/lib/Target/X86/X86PfmCounters.td
index a1a4210b5ebf..5610f4bc8873 100644
--- a/lib/Target/X86/X86PfmCounters.td
+++ b/lib/Target/X86/X86PfmCounters.td
@@ -1,9 +1,8 @@
//===-- X86PfmCounters.td - X86 Hardware Counters ----------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp
index 355291916ee8..78fede3dcde2 100644
--- a/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -1,9 +1,8 @@
//===- X86RegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -160,7 +159,7 @@ const RegisterBankInfo::InstructionMapping &
X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- auto Opc = MI.getOpcode();
+ unsigned Opc = MI.getOpcode();
// Try the default logic for non-generic instructions that are either copies
// or already have some operands assigned to banks.
@@ -174,17 +173,22 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_ADD:
case TargetOpcode::G_SUB:
case TargetOpcode::G_MUL:
- case TargetOpcode::G_SHL:
- case TargetOpcode::G_LSHR:
- case TargetOpcode::G_ASHR:
return getSameOperandsMapping(MI, false);
- break;
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FDIV:
return getSameOperandsMapping(MI, true);
- break;
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR: {
+ unsigned NumOperands = MI.getNumOperands();
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+ auto Mapping = getValueMapping(getPartialMappingIdx(Ty, false), 3);
+ return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands);
+  }
default:
break;
}
diff --git a/lib/Target/X86/X86RegisterBankInfo.h b/lib/Target/X86/X86RegisterBankInfo.h
index e227880427f3..c1f3001c6180 100644
--- a/lib/Target/X86/X86RegisterBankInfo.h
+++ b/lib/Target/X86/X86RegisterBankInfo.h
@@ -1,9 +1,8 @@
//===- X86RegisterBankInfo ---------------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/X86/X86RegisterBanks.td b/lib/Target/X86/X86RegisterBanks.td
index 6d17cd53a0c1..74c515850ab1 100644
--- a/lib/Target/X86/X86RegisterBanks.td
+++ b/lib/Target/X86/X86RegisterBanks.td
@@ -1,9 +1,8 @@
//=- X86RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 55842a4a2091..2e2f1f9e438a 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -164,6 +163,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
case X86::RFP32RegClassID:
case X86::RFP64RegClassID:
case X86::RFP80RegClassID:
+ case X86::VR512_0_15RegClassID:
case X86::VR512RegClassID:
// Don't return a super-class that would shrink the spill size.
// That can happen with the vector and float classes.
@@ -216,6 +216,21 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
}
}
+bool X86RegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const {
+ // Prevent rewriting a copy where the destination size is larger than the
+ // input size. See PR41619.
+  // FIXME: Should this be factored into the base implementation somehow?
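+  // That is, do not replace a 32-bit source (read through GR64:sub_32bit) with
+  // the full 64-bit GR64 register.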
+ if (DefRC->hasSuperClassEq(&X86::GR64RegClass) && DefSubReg == 0 &&
+ SrcRC->hasSuperClassEq(&X86::GR64RegClass) && SrcSubReg == X86::sub_32bit)
+ return false;
+
+ return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
+ SrcRC, SrcSubReg);
+}
+
const TargetRegisterClass *
X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
@@ -497,6 +512,9 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const X86FrameLowering *TFI = getFrameLowering(MF);
+ // Set the floating point control register as reserved.
+ Reserved.set(X86::FPCW);
+
// Set the stack-pointer register and its aliases as reserved.
for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
++I)
@@ -747,7 +765,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
-unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const X86FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? FramePtr : StackPtr;
}
@@ -760,3 +778,12 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
FrameReg = getX86SubSuperRegister(FrameReg, 32);
return FrameReg;
}
+
+unsigned
+X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ unsigned StackReg = getStackRegister();
+ if (Subtarget.isTarget64BitILP32())
+ StackReg = getX86SubSuperRegister(StackReg, 32);
+ return StackReg;
+}
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 29401dadead0..b82920898069 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -1,9 +1,8 @@
//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -50,7 +49,7 @@ private:
unsigned BasePtr;
public:
- X86RegisterInfo(const Triple &TT);
+ explicit X86RegisterInfo(const Triple &TT);
// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const;
@@ -75,6 +74,11 @@ public:
getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const override;
+ bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const override;
+
/// getPointerRegClass - Returns a TargetRegisterClass used for pointer
/// values.
const TargetRegisterClass *
@@ -129,15 +133,16 @@ public:
RegScavenger *RS = nullptr) const override;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
- unsigned getStackRegister() const { return StackPtr; }
- unsigned getBaseRegister() const { return BasePtr; }
+ unsigned getPtrSizedStackRegister(const MachineFunction &MF) const;
+ Register getStackRegister() const { return StackPtr; }
+ Register getBaseRegister() const { return BasePtr; }
/// Returns physical register used as frame pointer.
/// This will always returns the frame pointer register, contrary to
/// getFrameRegister() which returns the "base pointer" in situations
/// involving a stack, frame and base pointer.
- unsigned getFramePtr() const { return FramePtr; }
+ Register getFramePtr() const { return FramePtr; }
// FIXME: Move to FrameInfok
unsigned getSlotSize() const { return SlotSize; }
};
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index aa20273f89ab..0528b90c1fd5 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -1,9 +1,8 @@
//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -29,6 +28,8 @@ let Namespace = "X86" in {
def sub_32bit : SubRegIndex<32>;
def sub_xmm : SubRegIndex<128>;
def sub_ymm : SubRegIndex<256>;
+ def sub_mask_0 : SubRegIndex<-1>;
+ def sub_mask_1 : SubRegIndex<-1, -1>;
}
//===----------------------------------------------------------------------===//
@@ -278,7 +279,7 @@ def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, 100, 100]>;
// pseudo registers, but we still mark them as aliasing FP registers. That
// way both kinds can be live without exceeding the stack depth. ST registers
// are only live around inline assembly.
-def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST0 : X86Reg<"st", 0>, DwarfRegNum<[33, 12, 11]>;
def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
@@ -288,7 +289,10 @@ def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
// Floating-point status word
-def FPSW : X86Reg<"fpsw", 0>;
+def FPSW : X86Reg<"fpsr", 0>;
+
+// Floating-point control word
+def FPCW : X86Reg<"fpcr", 0>;
// Status flags register.
//
@@ -539,6 +543,9 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
let isAllocatable = 0;
}
+// Helper to allow %st to print as %st(0) when it's encoded in the instruction.
+def RSTi : RegisterOperand<RST, "printSTiRegOperand">;
+
// Generic vector registers: VR64 and VR128.
// Ensure that float types are declared first - only float is legal on SSE1.
def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
@@ -547,17 +554,6 @@ def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128
def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 15)>;
-// Special classes that help the assembly parser choose some alternate
-// instructions to favor 2-byte VEX encodings.
-def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
- 128, (sequence "XMM%u", 0, 7)>;
-def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
- 128, (sequence "XMM%u", 8, 15)>;
-def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
- 256, (sequence "YMM%u", 0, 7)>;
-def VR256H : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
- 256, (sequence "YMM%u", 8, 15)>;
-
// Status flags registers.
def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
let CopyCost = -1; // Don't allow copying of status registers.
@@ -576,6 +572,10 @@ def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> {
def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
512, (sequence "ZMM%u", 0, 31)>;
+// Represents the lower 16 registers that have VEX/legacy encodable subregs.
+def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
+ 512, (sequence "ZMM%u", 0, 15)>;
+
// Scalar AVX-512 floating point registers.
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
@@ -596,6 +596,16 @@ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
+// Mask register pairs
+def KPAIRS : RegisterTuples<[sub_mask_0, sub_mask_1],
+ [(add K0, K2, K4, K6), (add K1, K3, K5, K7)]>;
+
+def VK1PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK2PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK4PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK8PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK16PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+
def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;}
def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
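The new KPAIRS/VK*PAIR definitions above rely on RegisterTuples to synthesize mask-register pairs. A minimal standalone sketch of the mechanism, using placeholder names (only KPAIRS and the VK*PAIR classes are part of the patch):

    // RegisterTuples takes one sub-register index per column and parallel
    // register lists; row i of the lists forms tuple i. The KPAIRS definition
    // above therefore yields the pairs (K0,K1), (K2,K3), (K4,K5), (K6,K7),
    // each reachable through sub_mask_0 / sub_mask_1.
    def ILLUSTRATIVE_PAIRS : RegisterTuples<[sub_mask_0, sub_mask_1],
                                            [(add K0, K2, K4, K6),
                                             (add K1, K3, K5, K7)]>;
    // A class over the tuples makes the pairs allocatable; Size = 32 because
    // each pair spans two 16-bit mask registers.
    def ILLUSTRATIVE_PAIR_CLASS : RegisterClass<"X86", [untyped], 16,
                                                (add ILLUSTRATIVE_PAIRS)> {
      let Size = 32;
    }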
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
index 08994cccb21e..b435b22e8ac7 100644
--- a/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -1,9 +1,8 @@
//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td
index 971a50196e45..7574e4b8f896 100755
--- a/lib/Target/X86/X86SchedBroadwell.td
+++ b/lib/Target/X86/X86SchedBroadwell.td
@@ -1,9 +1,8 @@
//=- X86SchedBroadwell.td - X86 Broadwell Scheduling ---------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -82,6 +81,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 6>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -159,7 +160,6 @@ defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>;
def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads.
defm : BWWriteResPair<WriteCMOV, [BWPort06], 1>; // Conditional move.
-defm : BWWriteResPair<WriteCMOV2, [BWPort06,BWPort0156], 2, [1,1], 2>; // // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [BWPort1], 3, [1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [BWPort06]>; // Setcc.
@@ -186,7 +186,7 @@ defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>;
// Integer shifts and rotates.
defm : BWWriteResPair<WriteShift, [BWPort06], 1>;
defm : BWWriteResPair<WriteShiftCL, [BWPort06,BWPort0156], 3, [2,1], 3>;
-defm : BWWriteResPair<WriteRotate, [BWPort06], 2, [2], 2>;
+defm : BWWriteResPair<WriteRotate, [BWPort06], 1, [1], 1>;
defm : BWWriteResPair<WriteRotateCL, [BWPort06,BWPort0156], 3, [2,1], 3>;
// SHLD/SHRD.
@@ -732,10 +732,10 @@ def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> {
}
def: InstRW<[BWWriteResGroup20], (instrs CWD,
JCXZ, JECXZ, JRCXZ,
- ADC8i8, SBB8i8)>;
-def: InstRW<[BWWriteResGroup20], (instregex "ADC8ri",
- "SBB8ri",
- "SET(A|BE)r")>;
+ ADC8i8, SBB8i8,
+ ADC16i16, SBB16i16,
+ ADC32i32, SBB32i32,
+ ADC64i32, SBB64i32)>;
def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> {
let Latency = 2;
@@ -814,7 +814,6 @@ def BWWriteResGroup38 : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> {
let ResourceCycles = [1,1,1,1];
}
def: InstRW<[BWWriteResGroup38], (instrs CALL64pcrel32)>;
-def: InstRW<[BWWriteResGroup38], (instregex "SET(A|BE)m")>;
def BWWriteResGroup39 : SchedWriteRes<[BWPort0,BWPort1]> {
let Latency = 4;
@@ -890,8 +889,7 @@ def BWWriteResGroup47 : SchedWriteRes<[BWPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup47], (instregex "(V?)PCMPGTQ(Y?)rr",
- "MUL_(FPrST0|FST0r|FrST0)")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> {
let Latency = 5;
@@ -965,6 +963,7 @@ def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> {
}
def: InstRW<[BWWriteResGroup59], (instrs CVTPS2PDrm, VCVTPS2PDrm,
CVTSS2SDrm, VCVTSS2SDrm,
+ CVTSS2SDrm_Int, VCVTSS2SDrm_Int,
VPSLLVQrm,
VPSRLVQrm)>;
@@ -1103,6 +1102,14 @@ def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
def: InstRW<[BWWriteResGroup87], (instregex "ROL(8|16|32|64)m(1|i)",
"ROR(8|16|32|64)m(1|i)")>;
+def BWWriteResGroup87_1 : SchedWriteRes<[BWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup87_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
def BWWriteResGroup88 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
@@ -1592,4 +1599,140 @@ def: InstRW<[BWWriteResGroup202], (instrs FSTENVm)>;
def: InstRW<[WriteZero], (instrs CLC)>;
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Haswell and Broadwell Pipeline" > "Register allocation and
+// renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def BWWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def BWWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[BWWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def BWWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[BWWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
+ VXORPDrr)>;
+
+def BWWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[BWWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
+def BWWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>;
+
+def BWWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomLogicY], (instrs VPXORYrr)>;
+
+def BWWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def BWWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomALUY], (instrs VPSUBBYrr,
+ VPSUBDYrr,
+ VPSUBQYrr,
+ VPSUBWYrr,
+ VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def BWWritePCMPGTQ : SchedWriteRes<[BWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def BWWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [BWWritePCMPGTQ]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// CMOVs that use both Z and C flag require an extra uop.
+def BWWriteCMOVA_CMOVBErr : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [1,1];
+ let NumMicroOps = 2;
+}
+
+def BWWriteCMOVA_CMOVBErm : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
+ let Latency = 7;
+ let ResourceCycles = [1,1,1];
+ let NumMicroOps = 3;
+}
+
+def BWCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [BWWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def BWCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [BWWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[BWCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[BWCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both Z and C flag require an extra uop.
+def BWWriteSETA_SETBEr : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [1,1];
+ let NumMicroOps = 2;
+}
+
+def BWWriteSETA_SETBEm : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,1,1];
+ let NumMicroOps = 4;
+}
+
+def BWSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [BWWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def BWSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [BWWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[BWSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[BWSETA_SETBErm], (instrs SETCCm)>;
+
} // SchedModel
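The renamer-handled ("zero idiom") entries above all follow one pattern, repeated per instruction class with a different fallback write. A standalone sketch of that pattern, with placeholder names:

    // A write that claims no ports and no latency, for the case where the
    // renamer eliminates the instruction.
    def ExampleZeroLatency : SchedWriteRes<[]> {
      let Latency = 0;
    }
    // If the operands match ZeroIdiomPredicate (e.g. "xorl %eax, %eax",
    // where both sources are the same register), use the zero-latency write;
    // otherwise fall back to the ordinary ALU write.
    def ExampleZeroIdiom : SchedWriteVariant<[
      SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ExampleZeroLatency]>,
      SchedVar<NoSchedPred, [WriteALU]>
    ]>;
    def : InstRW<[ExampleZeroIdiom], (instrs XOR32rr)>;

The FP and vector variants differ only in the fallback (WriteFLogic, WriteVecLogicX, and so on), or use a dedicated SchedWriteRes such as BWWritePCMPGTQ when no generic write matches the non-idiom latency.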
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 06a32fb0b1cd..284d1567c5c6 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -1,9 +1,8 @@
//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -87,6 +86,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -151,7 +152,7 @@ defm : X86WriteRes<WriteXCHG, [HWPort0156], 2, [3], 3>;
// Integer shifts and rotates.
defm : HWWriteResPair<WriteShift, [HWPort06], 1>;
defm : HWWriteResPair<WriteShiftCL, [HWPort06, HWPort0156], 3, [2,1], 3>;
-defm : HWWriteResPair<WriteRotate, [HWPort06], 2, [2], 2>;
+defm : HWWriteResPair<WriteRotate, [HWPort06], 1, [1], 1>;
defm : HWWriteResPair<WriteRotateCL, [HWPort06, HWPort0156], 3, [2,1], 3>;
// SHLD/SHRD.
@@ -164,7 +165,6 @@ defm : HWWriteResPair<WriteJump, [HWPort06], 1>;
defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>;
defm : HWWriteResPair<WriteCMOV, [HWPort06,HWPort0156], 2, [1,1], 2>; // Conditional move.
-defm : HWWriteResPair<WriteCMOV2, [HWPort06,HWPort0156], 3, [1,2], 3>; // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [HWPort1], 3, [1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [HWPort06]>; // Setcc.
def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> {
@@ -1126,7 +1126,6 @@ def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> {
let ResourceCycles = [1,1];
}
def: InstRW<[HWWriteResGroup35], (instrs CWD, JCXZ, JECXZ, JRCXZ)>;
-def: InstRW<[HWWriteResGroup35], (instregex "SET(A|BE)r")>;
def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> {
let Latency = 7;
@@ -1172,7 +1171,6 @@ def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
let ResourceCycles = [1,1,1,1];
}
def: InstRW<[HWWriteResGroup45], (instrs CALL64pcrel32)>;
-def: InstRW<[HWWriteResGroup45], (instregex "SET(A|BE)m")>;
def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
let Latency = 8;
@@ -1182,6 +1180,14 @@ def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
def: InstRW<[HWWriteResGroup46], (instregex "ROL(8|16|32|64)m(1|i)",
"ROR(8|16|32|64)m(1|i)")>;
+def HWWriteResGroup46_1 : SchedWriteRes<[HWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup46_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
@@ -1391,8 +1397,8 @@ def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
let ResourceCycles = [1,1,1];
}
def: InstRW<[HWWriteResGroup78_1], (instrs MMX_CVTPI2PDirm,
- CVTSD2SSrm,
- VCVTSD2SSrm)>;
+ CVTSD2SSrm, CVTSD2SSrm_Int,
+ VCVTSD2SSrm, VCVTSD2SSrm_Int)>;
def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> {
let Latency = 9;
@@ -1442,8 +1448,7 @@ def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup89], (instregex "(V?)PCMPGTQ(Y?)rr",
- "MUL_(FPrST0|FST0r|FrST0)")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
def HWWriteResGroup91_2 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 11;
@@ -1847,4 +1852,170 @@ def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm,
def: InstRW<[WriteZero], (instrs CLC)>;
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Haswell and Broadwell Pipeline" > "Register allocation and
+// renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def HWWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def HWWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[HWWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def HWWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[HWWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
+ VXORPDrr)>;
+
+def HWWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[HWWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
+def HWWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>;
+
+def HWWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomLogicY], (instrs VPXORYrr)>;
+
+def HWWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def HWWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomALUY], (instrs VPSUBBYrr,
+ VPSUBDYrr,
+ VPSUBQYrr,
+ VPSUBWYrr,
+ VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def HWWritePCMPGTQ : SchedWriteRes<[HWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def HWWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [HWWritePCMPGTQ]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// The 0x83 ADC/SBB opcodes have special support for an immediate of 0 that
+// requires only a single uop. It does not apply to the GR8 encoding, and it
+// only applies to the 8-bit immediate form, since using a larger immediate
+// for 0 would be pointless. Unfortunately, this optimization does not apply
+// to the AX/EAX/RAX short encodings we convert to in MCInstLowering, so we
+// exclude AX/EAX/RAX here since we schedule before that point.
+// TODO: Should we disable using the short encodings on these CPUs?
+def HWFastADC0 : MCSchedPredicate<
+ CheckAll<[
+ CheckImmOperand<2, 0>, // Second MCOperand is Imm and has value 0.
+ CheckNot<CheckRegOperand<1, AX>>, // First MCOperand is not register AX
+ CheckNot<CheckRegOperand<1, EAX>>, // First MCOperand is not register EAX
+ CheckNot<CheckRegOperand<1, RAX>> // First MCOperand is not register RAX
+ ]>
+>;
+
+def HWWriteADC0 : SchedWriteRes<[HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def HWWriteADC : SchedWriteVariant<[
+ SchedVar<HWFastADC0, [HWWriteADC0]>,
+ SchedVar<NoSchedPred, [WriteADC]>
+]>;
+
+def : InstRW<[HWWriteADC], (instrs ADC16ri8, ADC32ri8, ADC64ri8,
+ SBB16ri8, SBB32ri8, SBB64ri8)>;
+
+// CMOVs that use both Z and C flag require an extra uop.
+def HWWriteCMOVA_CMOVBErr : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 3;
+}
+
+def HWWriteCMOVA_CMOVBErm : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 8;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def HWCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [HWWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def HWCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [HWWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[HWCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[HWCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both Z and C flag require an extra uop.
+def HWWriteSETA_SETBEr : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [1,1];
+ let NumMicroOps = 2;
+}
+
+def HWWriteSETA_SETBEm : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,1,1];
+ let NumMicroOps = 4;
+}
+
+def HWSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [HWWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def HWSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [HWWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[HWSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[HWSETA_SETBErm], (instrs SETCCm)>;
+
} // SchedModel
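A note on HWFastADC0 above: the operand indices follow the MCInst layout of the ri8 forms, destination first, then the tied source register, then the 8-bit immediate. A standalone restatement with placeholder names:

    // For ADC32ri8 the MCInst operands are (dst, src1, imm8), so index 2 is
    // the immediate and index 1 the register being read.
    //   adcl $0, %ecx  -> imm == 0 and the register is not EAX: 1 uop
    //   adcl $0, %eax  -> excluded, since MCInstLowering may pick the short
    //                     EAX encoding, which does not get the fast path
    def ExampleFastADC0 : MCSchedPredicate<
      CheckAll<[
        CheckImmOperand<2, 0>,
        CheckNot<CheckRegOperand<1, EAX>>
      ]>
    >;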
diff --git a/lib/Target/X86/X86SchedPredicates.td b/lib/Target/X86/X86SchedPredicates.td
index 1c7f24375f61..41bd776648f7 100644
--- a/lib/Target/X86/X86SchedPredicates.td
+++ b/lib/Target/X86/X86SchedPredicates.td
@@ -1,9 +1,8 @@
//===-- X86SchedPredicates.td - X86 Scheduling Predicates --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -61,3 +60,27 @@ def IsThreeOperandsLEABody :
// X86GenInstrInfo.
def IsThreeOperandsLEAFn :
TIIPredicate<"isThreeOperandsLEA", IsThreeOperandsLEABody>;
+
+// A predicate to check for COND_A and COND_BE CMOVs which have an extra uop
+// on recent Intel CPUs.
+def IsCMOVArr_Or_CMOVBErr : CheckAny<[
+ CheckImmOperand_s<3, "X86::COND_A">,
+ CheckImmOperand_s<3, "X86::COND_BE">
+]>;
+
+def IsCMOVArm_Or_CMOVBErm : CheckAny<[
+ CheckImmOperand_s<7, "X86::COND_A">,
+ CheckImmOperand_s<7, "X86::COND_BE">
+]>;
+
+// A predicate to check for COND_A and COND_BE SETCCs which have an extra uop
+// on recent Intel CPUs.
+def IsSETAr_Or_SETBEr : CheckAny<[
+ CheckImmOperand_s<1, "X86::COND_A">,
+ CheckImmOperand_s<1, "X86::COND_BE">
+]>;
+
+def IsSETAm_Or_SETBEm : CheckAny<[
+ CheckImmOperand_s<5, "X86::COND_A">,
+ CheckImmOperand_s<5, "X86::COND_BE">
+]>;
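The operand indices in these predicates mark where the condition code sits in the unified CMOV and SETCC opcodes; the mapping below is inferred from the instruction definitions rather than stated in the patch, and the trailing sketch uses the Haswell port names purely for illustration:

    // Condition-code operand positions:
    //   SETCCr         : (dst, cond)                           -> index 1
    //   SETCCm         : (5 memory operands, cond)             -> index 5
    //   CMOV16/32/64rr : (dst, src1, src2, cond)                -> index 3
    //   CMOV16/32/64rm : (dst, src1, 5 memory operands, cond)   -> index 7
    // Each CPU model then pairs a predicate with a SchedWriteVariant:
    def ExampleSETA_SETBEr : SchedWriteRes<[HWPort06, HWPort0156]> {
      let Latency = 2;
      let NumMicroOps = 2;  // both the C and Z flags are needed, hence the extra uop
    }
    def ExampleSETA_SETBErr : SchedWriteVariant<[
      SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [ExampleSETA_SETBEr]>,
      SchedVar<NoSchedPred, [WriteSETCC]>
    ]>;
    def : InstRW<[ExampleSETA_SETBErr], (instrs SETCCr)>;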
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index 9dbf0976989f..d40bdf728a48 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -1,9 +1,8 @@
//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -77,6 +76,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -159,7 +160,6 @@ defm : SBWriteResPair<WriteJump, [SBPort5], 1>;
defm : SBWriteResPair<WriteCRC32, [SBPort1], 3, [1], 1, 5>;
defm : SBWriteResPair<WriteCMOV, [SBPort05,SBPort015], 2, [1,1], 2>; // Conditional move.
-defm : SBWriteResPair<WriteCMOV2, [SBPort05,SBPort015], 3, [2,1], 3>; // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [SBPort5,SBPort05], 3, [2,1], 3>; // x87 conditional move.
def : WriteRes<WriteSETCC, [SBPort05]>; // Setcc.
def : WriteRes<WriteSETCCStore, [SBPort05,SBPort4,SBPort23]> {
@@ -615,13 +615,6 @@ def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr,
MMX_PSIGNDrr,
MMX_PSIGNWrr)>;
-def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SBWriteResGroup9], (instregex "SET(A|BE)r")>;
-
def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> {
let Latency = 2;
let NumMicroOps = 2;
@@ -705,12 +698,6 @@ def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> {
}
def: InstRW<[SBWriteResGroup29_2], (instrs PAUSE)>;
-def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> {
- let Latency = 5;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-
def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> {
let Latency = 5;
let NumMicroOps = 1;
@@ -772,13 +759,6 @@ def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> {
}
def: InstRW<[SBWriteResGroup41], (instrs FNINIT)>;
-def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
- let Latency = 3;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SBWriteResGroup43], (instregex "SET(A|BE)m")>;
-
def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> {
let Latency = 5;
let NumMicroOps = 4;
@@ -1148,6 +1128,12 @@ def SBWriteFZeroIdiom : SchedWriteVariant<[
def : InstRW<[SBWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
VXORPDrr)>;
+def SBWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[SBWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
def SBWriteVZeroIdiomLogicX : SchedWriteVariant<[
SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
SchedVar<NoSchedPred, [WriteVecLogicX]>
@@ -1166,10 +1152,68 @@ def : InstRW<[SBWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
PCMPGTDrr, VPCMPGTDrr,
PCMPGTWrr, VPCMPGTWrr)>;
+def SBWritePCMPGTQ : SchedWriteRes<[SBPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
def SBWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
- SchedVar<NoSchedPred, [SBWriteResGroup30]>
+ SchedVar<NoSchedPred, [SBWritePCMPGTQ]>
]>;
def : InstRW<[SBWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr)>;
+// CMOVs that use both Z and C flag require an extra uop.
+def SBWriteCMOVA_CMOVBErr : SchedWriteRes<[SBPort05,SBPort015]> {
+ let Latency = 3;
+ let ResourceCycles = [2,1];
+ let NumMicroOps = 3;
+}
+
+def SBWriteCMOVA_CMOVBErm : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [1,2,1];
+ let NumMicroOps = 4;
+}
+
+def SBCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SBWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def SBCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SBWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[SBCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[SBCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both Z and C flag require an extra uop.
+def SBWriteSETA_SETBEr : SchedWriteRes<[SBPort05]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SBWriteSETA_SETBEm : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def SBSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SBWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def SBSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SBWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[SBSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[SBSETA_SETBErm], (instrs SETCCm)>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td
index 2c9eb7516085..8f3e4ae62d53 100644
--- a/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/lib/Target/X86/X86SchedSkylakeClient.td
@@ -1,9 +1,8 @@
//=- X86SchedSkylake.td - X86 Skylake Client Scheduling ------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -81,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -157,7 +158,6 @@ defm : SKLWriteResPair<WriteCRC32, [SKLPort1], 3>;
def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads.
defm : SKLWriteResPair<WriteCMOV, [SKLPort06], 1, [1], 1>; // Conditional move.
-defm : SKLWriteResPair<WriteCMOV2, [SKLPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [SKLPort1], 3, [1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [SKLPort06]>; // Setcc.
def : WriteRes<WriteSETCCStore, [SKLPort06,SKLPort4,SKLPort237]> {
@@ -183,7 +183,7 @@ defm : SKLWriteResPair<WritePOPCNT, [SKLPort1], 3>;
// Integer shifts and rotates.
defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>;
defm : SKLWriteResPair<WriteShiftCL, [SKLPort06], 3, [3], 3>;
-defm : SKLWriteResPair<WriteRotate, [SKLPort06], 2, [2], 2>;
+defm : SKLWriteResPair<WriteRotate, [SKLPort06], 1, [1], 1>;
defm : SKLWriteResPair<WriteRotateCL, [SKLPort06], 3, [3], 3>;
// SHLD/SHRD.
@@ -659,8 +659,7 @@ def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> {
let ResourceCycles = [1];
}
def: InstRW<[SKLWriteResGroup9], (instregex "(V?)PADD(B|D|Q|W)(Y?)rr",
- "VPBLENDD(Y?)rri",
- "(V?)PSUB(B|D|Q|W)(Y?)rr")>;
+ "VPBLENDD(Y?)rri")>;
def SKLWriteResGroup10 : SchedWriteRes<[SKLPort0156]> {
let Latency = 1;
@@ -698,13 +697,6 @@ def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> {
def: InstRW<[SKLWriteResGroup14], (instrs FDECSTP,
MMX_MOVDQ2Qrr)>;
-def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup15], (instregex "SET(A|BE)r")>;
-
def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
@@ -735,9 +727,10 @@ def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
}
def: InstRW<[SKLWriteResGroup23], (instrs CWD,
JCXZ, JECXZ, JRCXZ,
- ADC8i8, SBB8i8)>;
-def: InstRW<[SKLWriteResGroup23], (instregex "ADC8ri",
- "SBB8ri")>;
+ ADC8i8, SBB8i8,
+ ADC16i16, SBB16i16,
+ ADC32i32, SBB32i32,
+ ADC64i32, SBB64i32)>;
def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> {
let Latency = 2;
@@ -776,8 +769,7 @@ def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> {
let ResourceCycles = [1];
}
def: InstRW<[SKLWriteResGroup30], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
- "VPBROADCAST(B|W)rr",
- "(V?)PCMPGTQ(Y?)rr")>;
+ "VPBROADCAST(B|W)rr")>;
def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> {
let Latency = 3;
@@ -839,13 +831,6 @@ def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> {
}
def: InstRW<[SKLWriteResGroup43], (instrs FNSTSWm)>;
-def SKLWriteResGroup44 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
- let Latency = 3;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SKLWriteResGroup44], (instregex "SET(A|BE)m")>;
-
def SKLWriteResGroup45 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> {
let Latency = 3;
let NumMicroOps = 4;
@@ -1183,6 +1168,14 @@ def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06
def: InstRW<[SKLWriteResGroup100], (instregex "ROL(8|16|32|64)m(1|i)",
"ROR(8|16|32|64)m(1|i)")>;
+def SKLWriteResGroup100_1 : SchedWriteRes<[SKLPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup100_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
def SKLWriteResGroup101 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
@@ -1747,4 +1740,150 @@ def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>;
def: InstRW<[WriteZero], (instrs CLC)>;
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Skylake Pipeline" > "Register allocation and renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def SKLWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def SKLWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[SKLWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def SKLWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[SKLWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
+ VXORPDrr)>;
+
+def SKLWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[SKLWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
+def SKLWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>;
+
+def SKLWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomLogicY], (instrs VPXORYrr)>;
+
+def SKLWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def SKLWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def SKLWritePSUB : SchedWriteRes<[SKLPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKLWriteVZeroIdiomPSUB : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKLWritePSUB]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ VPSUBBYrr,
+ VPSUBDYrr,
+ VPSUBQYrr,
+ VPSUBWYrr)>;
+
+def SKLWritePCMPGTQ : SchedWriteRes<[SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKLWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKLWritePCMPGTQ]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// CMOVs that use both Z and C flag require an extra uop.
+def SKLWriteCMOVA_CMOVBErr : SchedWriteRes<[SKLPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKLWriteCMOVA_CMOVBErm : SchedWriteRes<[SKLPort23,SKLPort06]> {
+ let Latency = 7;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 3;
+}
+
+def SKLCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SKLWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def SKLCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SKLWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[SKLCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[SKLCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both Z and C flag require an extra uop.
+def SKLWriteSETA_SETBEr : SchedWriteRes<[SKLPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKLWriteSETA_SETBEm : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def SKLSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SKLWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def SKLSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SKLWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[SKLSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[SKLSETA_SETBErm], (instrs SETCCm)>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td
index ec8e4db02d8a..58caf1dacfcb 100755
--- a/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/lib/Target/X86/X86SchedSkylakeServer.td
@@ -1,9 +1,8 @@
//=- X86SchedSkylake.td - X86 Skylake Server Scheduling ------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -81,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -158,7 +159,6 @@ defm : SKXWriteResPair<WriteCRC32, [SKXPort1], 3>;
def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads.
defm : SKXWriteResPair<WriteCMOV, [SKXPort06], 1, [1], 1>; // Conditional move.
-defm : SKXWriteResPair<WriteCMOV2, [SKXPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [SKXPort1], 3, [1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [SKXPort06]>; // Setcc.
def : WriteRes<WriteSETCCStore, [SKXPort06,SKXPort4,SKXPort237]> {
@@ -176,7 +176,7 @@ defm : X86WriteRes<WriteBitTestSetRegLd, [SKXPort0156,SKXPort23], 5, [1,1], 2>;
// Integer shifts and rotates.
defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>;
defm : SKXWriteResPair<WriteShiftCL, [SKXPort06], 3, [3], 3>;
-defm : SKXWriteResPair<WriteRotate, [SKXPort06], 2, [2], 2>;
+defm : SKXWriteResPair<WriteRotate, [SKXPort06], 1, [1], 1>;
defm : SKXWriteResPair<WriteRotateCL, [SKXPort06], 3, [3], 3>;
// SHLD/SHRD.
@@ -680,8 +680,7 @@ def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr",
"VPBLENDMD(Z128|Z256)rr",
"VPBLENDMQ(Z128|Z256)rr",
"VPBLENDMW(Z128|Z256)rr",
- "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rr",
- "(V?)PSUB(B|D|Q|W)rr",
+ "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rrk",
"VPTERNLOGD(Z|Z128|Z256)rri",
"VPTERNLOGQ(Z|Z128|Z256)rri")>;
@@ -722,13 +721,6 @@ def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> {
def: InstRW<[SKXWriteResGroup14], (instrs FDECSTP,
MMX_MOVDQ2Qrr)>;
-def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKXWriteResGroup15], (instregex "SET(A|BE)r")>;
-
def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
@@ -759,9 +751,10 @@ def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
}
def: InstRW<[SKXWriteResGroup23], (instrs CWD,
JCXZ, JECXZ, JRCXZ,
- ADC8i8, SBB8i8)>;
-def: InstRW<[SKXWriteResGroup23], (instregex "ADC8ri",
- "SBB8ri")>;
+ ADC8i8, SBB8i8,
+ ADC16i16, SBB16i16,
+ ADC32i32, SBB32i32,
+ ADC64i32, SBB64i32)>;
def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> {
let Latency = 2;
@@ -834,7 +827,6 @@ def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0
"VPCMPD(Z|Z128|Z256)rri",
"VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr",
"VPCMPGT(B|D|Q|W)(Z|Z128|Z256)rr",
- "(V?)PCMPGTQ(Y?)rr",
"VPCMPQ(Z|Z128|Z256)rri",
"VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri",
"VPCMPW(Z|Z128|Z256)rri",
@@ -900,13 +892,6 @@ def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> {
}
def: InstRW<[SKXWriteResGroup45], (instrs FNSTSWm)>;
-def SKXWriteResGroup46 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
- let Latency = 3;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SKXWriteResGroup46], (instregex "SET(A|BE)m")>;
-
def SKXWriteResGroup47 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237,SKXPort0156]> {
let Latency = 3;
let NumMicroOps = 4;
@@ -1446,6 +1431,14 @@ def SKXWriteResGroup107 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06
def: InstRW<[SKXWriteResGroup107], (instregex "ROL(8|16|32|64)m(1|i)",
"ROR(8|16|32|64)m(1|i)")>;
+def SKXWriteResGroup107_1 : SchedWriteRes<[SKXPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup107_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
def SKXWriteResGroup108 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
@@ -2463,4 +2456,171 @@ def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>;
def: InstRW<[WriteZero], (instrs CLC)>;
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Skylake Pipeline" > "Register allocation and renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def SKXWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def SKXWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[SKXWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def SKXWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[SKXWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr,
+ XORPDrr, VXORPDrr,
+ VXORPSZ128rr,
+ VXORPDZ128rr)>;
+
+def SKXWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[SKXWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
+ VXORPSZ256rr, VXORPDZ256rr)>;
+
+def SKXWriteFZeroIdiomZ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicZ]>
+]>;
+def : InstRW<[SKXWriteFZeroIdiomZ], (instrs VXORPSZrr, VXORPDZrr)>;
+
+def SKXWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
+ VPXORDZ128rr, VPXORQZ128rr)>;
+
+def SKXWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomLogicY], (instrs VPXORYrr,
+ VPXORDZ256rr, VPXORQZ256rr)>;
+
+def SKXWriteVZeroIdiomLogicZ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicZ]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomLogicZ], (instrs VPXORDZrr, VPXORQZrr)>;
+
+def SKXWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def SKXWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def SKXWritePSUB : SchedWriteRes<[SKXPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKXWriteVZeroIdiomPSUB : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKXWritePSUB]>
+]>;
+
+def : InstRW<[SKXWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr, VPSUBBZ128rr,
+ PSUBDrr, VPSUBDrr, VPSUBDZ128rr,
+ PSUBQrr, VPSUBQrr, VPSUBQZ128rr,
+ PSUBWrr, VPSUBWrr, VPSUBWZ128rr,
+ VPSUBBYrr, VPSUBBZ256rr,
+ VPSUBDYrr, VPSUBDZ256rr,
+ VPSUBQYrr, VPSUBQZ256rr,
+ VPSUBWYrr, VPSUBWZ256rr,
+ VPSUBBZrr,
+ VPSUBDZrr,
+ VPSUBQZrr,
+ VPSUBWZrr)>;
+def SKXWritePCMPGTQ : SchedWriteRes<[SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKXWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKXWritePCMPGTQ]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// CMOVs that use both Z and C flag require an extra uop.
+def SKXWriteCMOVA_CMOVBErr : SchedWriteRes<[SKXPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKXWriteCMOVA_CMOVBErm : SchedWriteRes<[SKXPort23,SKXPort06]> {
+ let Latency = 7;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 3;
+}
+
+def SKXCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SKXWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def SKXCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SKXWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[SKXCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[SKXCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both Z and C flag require an extra uop.
+def SKXWriteSETA_SETBEr : SchedWriteRes<[SKXPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKXWriteSETA_SETBEm : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def SKXSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SKXWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def SKXSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SKXWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[SKXSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[SKXSETA_SETBErm], (instrs SETCCm)>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 25aa83f96d3a..55ca85ec1e3d 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -1,9 +1,8 @@
//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -18,6 +17,12 @@ def ReadAfterVecLd : SchedRead;
def ReadAfterVecXLd : SchedRead;
def ReadAfterVecYLd : SchedRead;
+// Instructions that move data between general purpose registers and vector
+// registers may be subject to extra latency due to data bypass delays.
+// This SchedRead describes a bypass delay caused by data being moved from the
+// integer unit to the floating point unit.
+def ReadInt2Fpu : SchedRead;
+
// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
def WriteRMW : SchedWrite;
@@ -158,7 +163,6 @@ defm WritePOPCNT : X86SchedWritePair; // Bit population count.
defm WriteLZCNT : X86SchedWritePair; // Leading zero count.
defm WriteTZCNT : X86SchedWritePair; // Trailing zero count.
defm WriteCMOV : X86SchedWritePair; // Conditional move.
-defm WriteCMOV2 : X86SchedWritePair; // Conditional (CF + ZF flag) move.
def WriteFCMOV : SchedWrite; // X87 conditional move.
def WriteSETCC : SchedWrite; // Set register based on condition code.
def WriteSETCCStore : SchedWrite;
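Per-CPU models consume ReadInt2Fpu through ReadAdvance entries. A zero advance means the transfer is treated as free; a negative advance increases the effective producer-to-consumer latency on that operand, which is how a bypass delay is charged. Sketch (the concrete values are the ones used elsewhere in this patch):

    // Inside a processor model that has no visible int->fpu bypass penalty:
    def : ReadAdvance<ReadInt2Fpu, 0>;
    // The Piledriver model later in this patch instead uses a negative
    // advance, adding ~10 cycles between an integer producer and a vector
    // consumer:
    //   def : ReadAdvance<ReadInt2Fpu, -10>;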
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 1589ff2ef402..b0334655de7e 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -1,9 +1,8 @@
//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -47,6 +46,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>;
def : ReadAdvance<ReadAfterVecXLd, 3>;
def : ReadAdvance<ReadAfterVecYLd, 3>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
@@ -112,7 +113,6 @@ defm : AtomWriteResPair<WriteIDiv64, [AtomPort01], [AtomPort01],130,130,[130],[1
defm : X86WriteResPairUnsupported<WriteCRC32>;
defm : AtomWriteResPair<WriteCMOV, [AtomPort01], [AtomPort0]>;
-defm : AtomWriteResPair<WriteCMOV2, [AtomPort01], [AtomPort0]>;
defm : X86WriteRes<WriteFCMOV, [AtomPort01], 9, [9], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [AtomPort01]>;
@@ -740,7 +740,7 @@ def AtomWrite01_45 : SchedWriteRes<[AtomPort01]> {
let Latency = 45;
let ResourceCycles = [45];
}
-def : InstRW<[AtomWrite01_45], (instrs MONITORrrr)>;
+def : InstRW<[AtomWrite01_45], (instrs MONITOR32rrr, MONITOR64rrr)>;
def AtomWrite01_46 : SchedWriteRes<[AtomPort01]> {
let Latency = 46;
diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td
index 5798e1b2671b..8cc01c3acece 100644
--- a/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/lib/Target/X86/X86ScheduleBdVer2.td
@@ -1,9 +1,8 @@
//=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling * tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -209,7 +208,10 @@ multiclass __pdWriteResPair<X86FoldableSchedWrite SchedRW,
!add(Lat, LoadLat),
!if(!and(!empty(Res), !eq(LoadRes, 1)),
[],
- !listconcat([LoadRes], Res)),
+ !listconcat([LoadRes],
+ !if(!empty(Res),
+ !listsplat(1, !size(ExePorts)),
+ Res))),
!add(UOps, LoadUOps)>;
}
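The !listsplat change above supplies one default cycle per execution port whenever no explicit ResourceCycles list is given for the folded-load variant. A standalone evaluation of the same fold, with placeholder names:

    // Mirrors the folded-load branch of __pdWriteResPair: prefix the load's
    // resource count, then one cycle per execution port when Res is empty.
    class ExampleLoadCycles<list<ProcResourceKind> ExePorts, int LoadRes,
                            list<int> Res = []> {
      list<int> Value = !if(!and(!empty(Res), !eq(LoadRes, 1)),
                            [],
                            !listconcat([LoadRes],
                                        !if(!empty(Res),
                                            !listsplat(1, !size(ExePorts)),
                                            Res)));
    }
    // With ExePorts = [PdLoad, PdEX01] and LoadRes = 3, Value is [3, 1, 1].
    def ExampleCycles : ExampleLoadCycles<[PdLoad, PdEX01], 3>;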
@@ -218,7 +220,7 @@ multiclass PdWriteResExPair<X86FoldableSchedWrite SchedRW,
list<int> Res = [], int UOps = 1,
int LoadUOps = 0> {
defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
- /*LoadLat*/4, /*LoadRes*/1, LoadUOps>;
+ /*LoadLat*/4, /*LoadRes*/3, LoadUOps>;
}
multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW,
@@ -226,15 +228,15 @@ multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW,
list<int> Res = [], int UOps = 1,
int LoadUOps = 0> {
defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
- /*LoadLat*/5, /*LoadRes*/1, LoadUOps>;
+ /*LoadLat*/5, /*LoadRes*/3, LoadUOps>;
}
multiclass PdWriteResYMMPair<X86FoldableSchedWrite SchedRW,
list<ProcResourceKind> ExePorts, int Lat,
- list<int> Res, int UOps = 2,
+ list<int> Res = [], int UOps = 2,
int LoadUOps = 0> {
defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
- /*LoadLat*/5, /*LoadRes*/2, LoadUOps>;
+ /*LoadLat*/5, /*LoadRes*/3, LoadUOps>;
}
//===----------------------------------------------------------------------===//
@@ -251,6 +253,11 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;
+// Transfers from the int domain to the ivec domain incur an additional latency of 8..10cy.
+// Reference: Agner, Microarchitecture, "AMD Bulldozer, Piledriver, Steamroller
+// and Excavator pipeline", "Data delay between different execution domains"
+def : ReadAdvance<ReadInt2Fpu, -10>;
+
// A folded store needs a cycle on the PdStore for the store data.
def : WriteRes<WriteRMW, [PdStore]>;
@@ -258,15 +265,15 @@ def : WriteRes<WriteRMW, [PdStore]>;
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteLoad, [PdLoad]> { let Latency = 5; }
+def : WriteRes<WriteLoad, [PdLoad]> { let Latency = 5; let ResourceCycles = [2]; }
def : WriteRes<WriteStore, [PdStore]>;
def : WriteRes<WriteStoreNT, [PdStore]>;
-def : WriteRes<WriteMove, [PdEX01]>;
+def : WriteRes<WriteMove, [PdEX01]> { let ResourceCycles = [2]; }
// Load/store MXCSR.
// FIXME: These are copy and pasted from WriteLoad/Store.
def : WriteRes<WriteLDMXCSR, [PdLoad]> { let Latency = 5; }
-def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; }
+def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; let ResourceCycles = [18]; }
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
@@ -300,6 +307,7 @@ def : InstRW<[PdWriteXLAT], (instrs XLAT)>;
def PdWriteLARrr : SchedWriteRes<[PdEX01]> {
let Latency = 184;
+ let ResourceCycles = [375];
let NumMicroOps = 45;
}
def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
@@ -307,22 +315,31 @@ def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
// Nops don't have dependencies, so there's no actual latency, but we set this
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
-def : WriteRes<WriteNop, [PdEX01]>;
+def : WriteRes<WriteNop, [PdEX01]> { let ResourceCycles = [2]; }
////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResExPair<WriteALU, [PdEX01]>;
+defm : PdWriteResExPair<WriteALU, [PdEX01], 1, [2]>;
+
+def PdWriteALURMW : SchedWriteRes<[PdLoad, PdEX01, PdStore]> {
+ let Latency = 6;
+ let ResourceCycles = [3, 2, 1];
+ let NumMicroOps = 1;
+}
+def : SchedAlias<WriteALURMW, PdWriteALURMW>;
def PdWriteLXADD : SchedWriteRes<[PdEX01]> {
let Latency = 6;
+ let ResourceCycles = [88];
let NumMicroOps = 4;
}
def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>;
def PdWriteBMI1 : SchedWriteRes<[PdEX01]> {
let Latency = 2;
+ let ResourceCycles = [2];
let NumMicroOps = 2;
}
def : InstRW<[PdWriteBMI1],
@@ -332,8 +349,9 @@ def : InstRW<[PdWriteBMI1],
BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr,
TZMSK32rr, TZMSK64rr)>;
-def PdWriteBMI1m : SchedWriteRes<[PdEX01]> {
+def PdWriteBMI1m : SchedWriteRes<[PdLoad, PdEX01]> {
let Latency = 6;
+ let ResourceCycles = [3, 3];
let NumMicroOps = 2;
}
def : InstRW<[PdWriteBMI1m],
@@ -345,26 +363,34 @@ def : InstRW<[PdWriteBMI1m],
defm : PdWriteResExPair<WriteADC, [PdEX01], 1, [2]>;
-defm : PdWriteRes<WriteBSWAP32, [PdEX1]>;
-defm : PdWriteRes<WriteBSWAP64, [PdEX1]>;
-defm : PdWriteRes<WriteCMPXCHG, [PdEX1], 3, [], 5>;
-defm : PdWriteRes<WriteCMPXCHGRMW, [PdEX1, PdStore, PdLoad], 3, [], 2>;
-defm : PdWriteRes<WriteXCHG, [PdEX1], 1, [], 2>;
+def PdWriteADCSBB64ri32 : SchedWriteRes<[PdEX01]> {
+ let ResourceCycles = [3];
+}
+def : InstRW<[PdWriteADCSBB64ri32], (instrs ADC64ri32, SBB64ri32)>;
+
+defm : PdWriteRes<WriteBSWAP32, [PdEX01]>;
+defm : PdWriteRes<WriteBSWAP64, [PdEX01]>;
+defm : PdWriteRes<WriteCMPXCHG, [PdEX1], 3, [3], 5>;
+defm : PdWriteRes<WriteCMPXCHGRMW, [PdEX1, PdStore, PdLoad], 3, [44, 1, 1], 2>;
+defm : PdWriteRes<WriteXCHG, [PdEX1], 1, [], 2>;
def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> {
let Latency = 3;
+ let ResourceCycles = [3];
let NumMicroOps = 3;
}
def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> {
let Latency = 3;
+ let ResourceCycles = [23];
let NumMicroOps = 5;
}
def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>;
def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> {
let Latency = 3;
+ let ResourceCycles = [21];
let NumMicroOps = 6;
}
def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
@@ -372,42 +398,40 @@ def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> {
let Latency = 3;
+ let ResourceCycles = [26];
let NumMicroOps = 18;
}
def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> {
let Latency = 3;
+ let ResourceCycles = [69];
let NumMicroOps = 22;
}
def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>;
-def PdWriteXCHG16rr : SchedWriteRes<[PdEX1]> {
- let Latency = 2;
- let NumMicroOps = 2;
-}
-def : InstRW<[PdWriteXCHG16rr], (instrs XCHG16rr)>;
-
def PdWriteXADD : SchedWriteRes<[PdEX1]> {
- let Latency = 2;
- let NumMicroOps = 4;
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
}
def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;
def PdWriteXADDm : SchedWriteRes<[PdEX1]> {
-let Latency = 6;
-let NumMicroOps = 4;
+ let Latency = 6;
+ let ResourceCycles = [20];
+ let NumMicroOps = 4;
}
def : InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>;
-defm : PdWriteResExPair<WriteIMul8, [PdEX1, PdMul], 4>;
-defm : PdWriteResExPair<WriteIMul16, [PdEX1, PdMul], 4, [], 2>;
-defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul], 5, [], 2>;
-defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul], 4>;
-defm : PdWriteResExPair<WriteIMul32, [PdEX1, PdMul], 4>;
-defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul], 4, [], 1, 1>;
-defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul], 4>;
-defm : PdWriteResExPair<WriteIMul64, [PdEX1, PdMul], 6, [1, 4]>;
+defm : PdWriteResExPair<WriteIMul8, [PdEX1, PdMul], 4, [1, 4]>;
+defm : PdWriteResExPair<WriteIMul16, [PdEX1, PdMul], 4, [1, 5], 2>;
+defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul], 5, [1, 5], 2>;
+defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul], 4, [1, 2]>;
+defm : PdWriteResExPair<WriteIMul32, [PdEX1, PdMul], 4, [1, 4]>;
+defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul], 4, [1, 2], 1, 1>;
+defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul], 4, [1, 2]>;
+defm : PdWriteResExPair<WriteIMul64, [PdEX1, PdMul], 6, [1, 6]>;
defm : PdWriteResExPair<WriteIMul64Imm, [PdEX1, PdMul], 6, [1, 4],1, 1>;
defm : PdWriteResExPair<WriteIMul64Reg, [PdEX1, PdMul], 6, [1, 4]>;
defm : X86WriteResUnsupported<WriteIMulH>; // BMI2 MULX
@@ -422,36 +446,48 @@ defm : PdWriteResExPair<WriteIDiv16, [PdEX1, PdDiv], 15, [1, 17],
defm : PdWriteResExPair<WriteIDiv32, [PdEX1, PdDiv], 14, [1, 25], 2>;
defm : PdWriteResExPair<WriteIDiv64, [PdEX1, PdDiv], 14, [1, 14], 2>;
-defm : PdWriteResExPair<WriteCRC32, [PdEX01], 3, [4], 3>;
+defm : PdWriteResExPair<WriteCRC32, [PdEX01], 2, [4], 3>;
def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> {
let Latency = 5;
- let ResourceCycles = [4];
+ let ResourceCycles = [10];
let NumMicroOps = 5;
}
def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>;
def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> {
let Latency = 6;
- let ResourceCycles = [4];
+ let ResourceCycles = [12];
let NumMicroOps = 7;
}
def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>;
def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> {
let Latency = 10;
- let ResourceCycles = [4];
+ let ResourceCycles = [17];
let NumMicroOps = 11;
}
def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>;
defm : PdWriteResExPair<WriteCMOV, [PdEX01]>; // Conditional move.
-defm : PdWriteResExPair<WriteCMOV2, [PdEX01], 1, [], 1, 1>; // Conditional (CF + ZF flag) move.
-def : InstRW<[WriteCMOV2.Folded], (instrs CMOVG16rm, CMOVG32rm, CMOVG64rm,
- CMOVGE16rm, CMOVGE32rm, CMOVGE64rm,
- CMOVL16rm, CMOVL32rm, CMOVL64rm,
- CMOVLE16rm, CMOVLE32rm, CMOVLE64rm)>;
+def PdWriteCMOVm : SchedWriteRes<[PdLoad, PdEX01]> {
+ let Latency = 5;
+ let ResourceCycles = [3, 3];
+ let NumMicroOps = 2;
+}
+
+def PdWriteCMOVmVar : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_BE">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_A">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_L">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_GE">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_LE">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_G">>, [PdWriteCMOVm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[PdWriteCMOVmVar], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
defm : PdWriteRes<WriteFCMOV, [PdFPU0, PdFPFMA]>; // x87 conditional move.
@@ -462,107 +498,143 @@ def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> {
let ResourceCycles = [2];
let NumMicroOps = 2;
}
-def : InstRW<[PdWriteSETGEmSETGmSETLEmSETLm], (instrs SETGEm, SETGm,
- SETLEm, SETLm)>;
-defm : PdWriteRes<WriteLAHFSAHF, [PdEX01], 2, [], 2>;
+def PdSETGEmSETGmSETLEmSETLm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_GE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_G">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_LE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_L">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+def : InstRW<[PdSETGEmSETGmSETLEmSETLm], (instrs SETCCm)>;
+
+defm : PdWriteRes<WriteLAHFSAHF, [PdEX01], 2, [4], 2>;
-def WriteLAHF : SchedWriteRes<[PdEX01]> {
+def PdWriteLAHF : SchedWriteRes<[PdEX01]> {
let Latency = 2;
+ let ResourceCycles = [4];
let NumMicroOps = 4;
}
-def : InstRW<[WriteLAHF], (instrs LAHF)>;
+def : InstRW<[PdWriteLAHF], (instrs LAHF)>;
-def WriteSAHF : SchedWriteRes<[PdEX01]> {
+def PdWriteSAHF : SchedWriteRes<[PdEX01]> {
let Latency = 2;
+ let ResourceCycles = [2];
let NumMicroOps = 2;
}
-def : InstRW<[WriteSAHF], (instrs SAHF)>;
+def : InstRW<[PdWriteSAHF], (instrs SAHF)>;
+
+defm : PdWriteRes<WriteBitTest, [PdEX01], 1, [2], 1>;
+defm : PdWriteRes<WriteBitTestImmLd, [PdEX01, PdLoad], 5, [2, 3], 1>;
+defm : PdWriteRes<WriteBitTestRegLd, [PdEX01, PdLoad], 5, [7, 2], 7>;
+defm : PdWriteRes<WriteBitTestSet, [PdEX01], 2, [2], 2>;
+defm : PdWriteRes<WriteBitTestSetImmLd, [PdEX01, PdLoad], 6, [1, 1], 4>;
+defm : PdWriteRes<WriteBitTestSetRegLd, [PdEX01, PdLoad], 6, [1, 1], 10>;
-defm : PdWriteRes<WriteBitTest, [PdEX01], 1, [1], 1>;
-defm : PdWriteRes<WriteBitTestImmLd, [PdEX01, PdLoad], 5, [1, 1], 1>;
-defm : PdWriteRes<WriteBitTestRegLd, [PdEX01, PdLoad], 5, [1, 1], 7>;
-defm : PdWriteRes<WriteBitTestSet, [PdEX01], 2, [1], 2>;
-defm : PdWriteRes<WriteBitTestSetImmLd, [PdEX01, PdLoad], 6, [1, 1], 4>;
-defm : PdWriteRes<WriteBitTestSetImmRMW, [PdEX01, PdLoad], 6, [1, 1], 4>;
-defm : PdWriteRes<WriteBitTestSetRegLd, [PdEX01, PdLoad], 6, [1, 1], 10>;
-defm : PdWriteRes<WriteBitTestSetRegRMW, [PdEX01, PdLoad], 6, [1, 1], 10>;
+def PdWriteBTSIm : SchedWriteRes<[PdEX01, PdLoad]> {
+ let Latency = 7;
+ let ResourceCycles = [42, 1];
+ let NumMicroOps = 4;
+}
+def : SchedAlias<WriteBitTestSetImmRMW, PdWriteBTSIm>;
+def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> {
+ let Latency = 7;
+ let ResourceCycles = [44, 1];
+ let NumMicroOps = 10;
+}
+def : SchedAlias<WriteBitTestSetRegRMW, PdWriteBTSRm>;
// This is for simple LEAs with one or two input operands.
// FIXME: SAGU 3-operand LEA
def : WriteRes<WriteLEA, [PdEX01]> { let NumMicroOps = 2; }
// Bit counts.
-defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [4], 6, 2>;
-defm : PdWriteResExPair<WriteBSR, [PdEX01], 4, [4], 7, 2>;
-defm : PdWriteResExPair<WritePOPCNT, [PdEX01], 4>;
-defm : PdWriteResExPair<WriteLZCNT, [PdEX01], 2, [], 2>;
-defm : PdWriteResExPair<WriteTZCNT, [PdEX01], 2, [2], 2>;
+defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [6], 6, 2>;
+defm : PdWriteResExPair<WriteBSR, [PdEX01], 4, [8], 7, 2>;
+defm : PdWriteResExPair<WritePOPCNT, [PdEX01], 4, [4]>;
+defm : PdWriteResExPair<WriteLZCNT, [PdEX0], 2, [2], 2>;
+defm : PdWriteResExPair<WriteTZCNT, [PdEX0], 2, [2], 2>;
// BMI1 BEXTR, BMI2 BZHI
-defm : PdWriteResExPair<WriteBEXTR, [PdEX01], 2, [], 2>;
-defm : PdWriteResExPair<WriteBLS, [PdEX01], 2, [], 2>;
+defm : PdWriteResExPair<WriteBEXTR, [PdEX01], 2, [2], 2>;
+defm : PdWriteResExPair<WriteBLS, [PdEX01], 2, [2], 2>;
defm : PdWriteResExPair<WriteBZHI, [PdEX01]>;
+def PdWriteBEXTRI : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [4];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBEXTRI], (instrs BEXTRI32ri, BEXTRI64ri)>;
+
+def PdWriteBEXTRIm : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [5];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBEXTRIm], (instrs BEXTRI32mi, BEXTRI64mi)>;
+
////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResExPair<WriteShift, [PdEX01]>;
+defm : PdWriteResExPair<WriteShift, [PdEX01], 1, [2]>;
defm : PdWriteResExPair<WriteShiftCL, [PdEX01]>;
-defm : PdWriteResExPair<WriteRotate, [PdEX01]>;
+defm : PdWriteResExPair<WriteRotate, [PdEX01], 1, [2]>;
defm : PdWriteResExPair<WriteRotateCL, [PdEX01]>;
def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> {
let Latency = 12;
+ let ResourceCycles = [24];
let NumMicroOps = 26;
}
def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>;
def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> {
let Latency = 12;
+ let ResourceCycles = [23];
let NumMicroOps = 23;
}
def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>;
def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> {
let Latency = 11;
+ let ResourceCycles = [22];
let NumMicroOps = 24;
}
def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>;
def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> {
let Latency = 10;
+ let ResourceCycles = [20];
let NumMicroOps = 22;
}
def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>;
def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> {
let Latency = 10;
+ let ResourceCycles = [19];
let NumMicroOps = 19;
}
def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>;
-def PdWriteRCL32rCLRCL64rCL : SchedWriteRes<[PdEX01]> {
+def PdWriteRCL3264rCL : SchedWriteRes<[PdEX01]> {
let Latency = 7;
+ let ResourceCycles = [14];
let NumMicroOps = 17;
}
-def : InstRW<[PdWriteRCL32rCLRCL64rCL], (instrs RCL32rCL, RCL64rCL)>;
+def : InstRW<[PdWriteRCL3264rCL], (instrs RCL32rCL, RCL64rCL)>;
-def PdWriteRCR64rCL : SchedWriteRes<[PdEX01]> {
+def PdWriteRCR3264rCL : SchedWriteRes<[PdEX01]> {
let Latency = 7;
+ let ResourceCycles = [13];
let NumMicroOps = 16;
}
-def : InstRW<[PdWriteRCR64rCL], (instrs RCR64rCL)>;
-
-def PdWriteRCR32rCL : SchedWriteRes<[PdEX01]> {
- let Latency = 7;
- let NumMicroOps = 16;
-}
-def : InstRW<[PdWriteRCR32rCL ], (instrs RCR32rCL)>;
+def : InstRW<[PdWriteRCR3264rCL], (instrs RCR32rCL, RCR64rCL)>;
def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> {
let Latency = 7;
+ let ResourceCycles = [14];
let NumMicroOps = 15;
}
def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;
@@ -570,31 +642,35 @@ def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;
def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> {
let Latency = 9;
+ let ResourceCycles = [18];
let NumMicroOps = 20;
}
def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>;
def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> {
let Latency = 11;
+ let ResourceCycles = [21];
let NumMicroOps = 21;
}
def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>;
def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> {
let Latency = 8;
+ let ResourceCycles = [15];
let NumMicroOps = 16;
}
def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>;
def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> {
let Latency = 13;
+ let ResourceCycles = [25];
let NumMicroOps = 25;
}
def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>;
// SHLD/SHRD.
-defm : PdWriteRes<WriteSHDrri, [PdEX01], 4, [6], 6>;
-defm : PdWriteRes<WriteSHDrrcl, [PdEX01], 4, [8], 7>;
+defm : PdWriteRes<WriteSHDrri, [PdEX01], 3, [6], 6>;
+defm : PdWriteRes<WriteSHDrrcl, [PdEX01], 3, [8], 7>;
def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> {
let Latency = 3;
@@ -604,8 +680,8 @@ def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> {
def : InstRW<[PdWriteSHLD32rri8SHRD16rri8 ], (instrs SHLD32rri8, SHRD16rri8)>;
def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> {
- let Latency = 4;
- let ResourceCycles = [8];
+ let Latency = 3;
+ let ResourceCycles = [6];
let NumMicroOps = 7;
}
def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL,
@@ -623,19 +699,20 @@ defm : PdWriteRes<WriteFLD0, [PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFLD1, [PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFLDC, [PdFPU1, PdFPSTO], 3>;
-defm : PdWriteRes<WriteFLoad, [PdLoad, PdFPU01, PdFPFMA], 5>;
-defm : PdWriteRes<WriteFLoadX, [PdLoad, PdFPU01, PdFPFMA], 5>;
-defm : PdWriteRes<WriteFLoadY, [PdLoad, PdFPU01, PdFPFMA], 5, [], 2>;
+defm : PdWriteRes<WriteFLoad, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteFLoadX, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteFLoadY, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3], 2>;
-defm : PdWriteRes<WriteFMaskedLoad, [PdLoad, PdFPU01, PdFPFMA], 6, [1, 1, 2]>;
-defm : PdWriteRes<WriteFMaskedLoadY, [PdLoad, PdFPU01, PdFPFMA], 6, [2, 2, 4], 2>;
+defm : PdWriteRes<WriteFMaskedLoad, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 1, 4]>;
+defm : PdWriteRes<WriteFMaskedLoadY, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 2, 4], 2>;
-defm : PdWriteRes<WriteFStore, [PdStore, PdFPU1, PdFPSTO], 2>;
-defm : PdWriteRes<WriteFStoreX, [PdStore, PdFPU1, PdFPSTO]>;
-defm : PdWriteRes<WriteFStoreY, [PdStore, PdFPU1, PdFPSTO], 1, [], 4>;
+defm : PdWriteRes<WriteFStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>;
+defm : PdWriteRes<WriteFStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>;
+defm : PdWriteRes<WriteFStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [1, 36, 2], 4>;
-def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
+def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU23, PdFPSTO]> {
let Latency = 2;
+ let ResourceCycles = [1, 3, 1];
let NumMicroOps = 2;
}
def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>;
@@ -649,33 +726,41 @@ defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>;
-defm : PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 4], 18>;
-defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 4], 34>;
+defm : PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
+defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>;
-defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA]>;
+defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>;
defm : PdWriteRes<WriteFMoveY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
defm : PdWriteRes<WriteEMMS, [PdFPU01, PdFPFMA], 2>;
defm : PdWriteResXMMPair<WriteFAdd, [PdFPU0, PdFPFMA], 5>;
defm : PdWriteResXMMPair<WriteFAddX, [PdFPU0, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFAddY, [PdFPU0, PdFPFMA], 5, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFAddY, [PdFPU0, PdFPFMA], 5, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
+def PdWriteX87Add: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
+ let Latency = 5;
+ let ResourceCycles = [3, 1, 10];
+}
+def : InstRW<[PdWriteX87Add], (instrs ADD_FI16m, ADD_FI32m, ADD_F32m, ADD_F64m,
+ SUB_FI16m, SUB_FI32m, SUB_F32m, SUB_F64m,
+ SUBR_FI16m, SUBR_FI32m, SUBR_F32m, SUBR_F64m)>;
+
defm : PdWriteResXMMPair<WriteFAdd64, [PdFPU0, PdFPFMA], 5>;
defm : PdWriteResXMMPair<WriteFAdd64X, [PdFPU0, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFAdd64Y, [PdFPU0, PdFPFMA], 5, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFAdd64Y, [PdFPU0, PdFPFMA], 5, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
defm : PdWriteResXMMPair<WriteFCmp, [PdFPU0, PdFPFMA], 2>;
defm : PdWriteResXMMPair<WriteFCmpX, [PdFPU0, PdFPFMA], 2>;
-defm : PdWriteResYMMPair<WriteFCmpY, [PdFPU0, PdFPFMA], 2, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFCmpY, [PdFPU0, PdFPFMA], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : PdWriteResXMMPair<WriteFCmp64, [PdFPU0, PdFPFMA], 2>;
defm : PdWriteResXMMPair<WriteFCmp64X, [PdFPU0, PdFPFMA], 2>;
-defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : PdWriteResXMMPair<WriteFCom, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
@@ -690,29 +775,35 @@ def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>;
defm : PdWriteResXMMPair<WriteFMul, [PdFPU1, PdFPFMA], 5>;
defm : PdWriteResXMMPair<WriteFMulX, [PdFPU1, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFMulY, [PdFPU1, PdFPFMA], 5, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFMulY, [PdFPU1, PdFPFMA], 5, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
+def PdWriteX87Mul: SchedWriteRes<[PdLoad, PdFPU1, PdFPFMA]> {
+ let Latency = 5;
+ let ResourceCycles = [3, 1, 10];
+}
+def : InstRW<[PdWriteX87Mul], (instrs MUL_FI16m, MUL_FI32m, MUL_F32m, MUL_F64m)>;
+
defm : PdWriteResXMMPair<WriteFMul64, [PdFPU1, PdFPFMA], 5>;
defm : PdWriteResXMMPair<WriteFMul64X, [PdFPU1, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFMul64Y, [PdFPU1, PdFPFMA], 5, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFMul64Y, [PdFPU1, PdFPFMA], 5, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
-defm : PdWriteResXMMPair<WriteFMA, [PdFPU, PdFPFMA], 5>;
-defm : PdWriteResXMMPair<WriteFMAX, [PdFPU, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFMAY, [PdFPU, PdFPFMA], 5, [1, 1]>;
+defm : PdWriteResXMMPair<WriteFMA, [PdFPU, PdFPFMA], 5, [1, 3]>;
+defm : PdWriteResXMMPair<WriteFMAX, [PdFPU, PdFPFMA], 5, [1, 3]>;
+defm : PdWriteResYMMPair<WriteFMAY, [PdFPU, PdFPFMA], 5, [1, 3]>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
-defm : PdWriteResXMMPair<WriteDPPD, [PdFPU1, PdFPFMA], 15, [1, 3], 15, 2>;
+defm : PdWriteResXMMPair<WriteDPPD, [PdFPU1, PdFPFMA], 15, [1, 10], 15, 2>;
-defm : PdWriteResXMMPair<WriteDPPS, [PdFPU1, PdFPFMA], 25, [1, 3], 16, 2>;
-defm : PdWriteResYMMPair<WriteDPPSY, [PdFPU1, PdFPFMA], 27, [2, 6], /*or 29*/ 25, 4>;
+defm : PdWriteResXMMPair<WriteDPPS, [PdFPU1, PdFPFMA], 25, [1, 14], 16, 2>;
+defm : PdWriteResYMMPair<WriteDPPSY, [PdFPU1, PdFPFMA], 27, [2, 25], /*or 29*/ 25, 4>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;
def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> {
- let Latency = 25;
- let ResourceCycles = [1, 3];
+ let Latency = 27;
+ let ResourceCycles = [1, 14];
let NumMicroOps = 17;
}
def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>;
@@ -722,118 +813,140 @@ defm : PdWriteResXMMPair<WriteFRcpX, [PdFPU1, PdFPFMA], 5>;
defm : PdWriteResYMMPair<WriteFRcpY, [PdFPU1, PdFPFMA], 5, [2, 1]>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
-defm : PdWriteResXMMPair<WriteFRsqrt, [PdFPU1, PdFPFMA], 5>;
+defm : PdWriteResXMMPair<WriteFRsqrt, [PdFPU1, PdFPFMA], 5, [1, 2]>;
defm : PdWriteResXMMPair<WriteFRsqrtX, [PdFPU1, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFRsqrtY, [PdFPU1, PdFPFMA], 5, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFRsqrtY, [PdFPU1, PdFPFMA], 5, [2, 2]>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
-defm : PdWriteResXMMPair<WriteFDiv, [PdFPU1, PdFPFMA], 9, [1, 19]>;
-defm : PdWriteResXMMPair<WriteFDivX, [PdFPU1, PdFPFMA], 9, [1, 19]>;
-defm : PdWriteResYMMPair<WriteFDivY, [PdFPU1, PdFPFMA], 9, [2, 38]>;
+defm : PdWriteResXMMPair<WriteFDiv, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFDivX, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFDivY, [PdFPU1, PdFPFMA], 9, [2, 18]>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
-defm : PdWriteResXMMPair<WriteFDiv64, [PdFPU1, PdFPFMA], 9, [1, 19]>;
-defm : PdWriteResXMMPair<WriteFDiv64X, [PdFPU1, PdFPFMA], 9, [1, 19]>;
-defm : PdWriteResYMMPair<WriteFDiv64Y, [PdFPU1, PdFPFMA], 9, [2, 38]>;
+def PdWriteX87Div: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
+ let Latency = 9;
+ let ResourceCycles = [3, 1, 18];
+}
+def : InstRW<[PdWriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
+ DIVR_FI16m, DIVR_FI32m,
+ DIV_F32m, DIV_F64m,
+ DIVR_F32m, DIVR_F64m)>;
+
+defm : PdWriteResXMMPair<WriteFDiv64, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFDiv64X, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFDiv64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
-defm : PdWriteResXMMPair<WriteFSqrt, [PdFPU1, PdFPFMA], 9, [1, 21]>;
-defm : PdWriteResXMMPair<WriteFSqrtX, [PdFPU1, PdFPFMA], 9, [1, 21]>;
-defm : PdWriteResYMMPair<WriteFSqrtY, [PdFPU1, PdFPFMA], 9, [2, 42]>;
+defm : PdWriteResXMMPair<WriteFSqrt, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFSqrtX, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFSqrtY, [PdFPU1, PdFPFMA], 9, [2, 18]>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
-defm : PdWriteResXMMPair<WriteFSqrt64, [PdFPU1, PdFPFMA], 9, [1, 27]>;
-defm : PdWriteResXMMPair<WriteFSqrt64X, [PdFPU1, PdFPFMA], 9, [1, 27]>;
-defm : PdWriteResYMMPair<WriteFSqrt64Y, [PdFPU1, PdFPFMA], 9, [2, 54]>;
+defm : PdWriteResXMMPair<WriteFSqrt64, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFSqrt64X, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFSqrt64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
-defm : PdWriteResXMMPair<WriteFSqrt80, [PdFPU1, PdFPFMA], 1, [1, 35]>;
-defm : PdWriteResXMMPair<WriteFSign, [PdFPU1, PdFPFMA]>;
+defm : PdWriteResXMMPair<WriteFSqrt80, [PdFPU1, PdFPFMA], 1, [1, 18]>;
+defm : PdWriteResXMMPair<WriteFSign, [PdFPU1, PdFPFMA], 1, [1, 4]>;
-defm : PdWriteResXMMPair<WriteFRnd, [PdFPU1, PdFPSTO], 4>;
+defm : PdWriteResXMMPair<WriteFRnd, [PdFPU1, PdFPSTO], 4, []>;
defm : PdWriteResYMMPair<WriteFRndY, [PdFPU1, PdFPSTO], 4, [2, 1], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
-def PdWriteVFRCZ : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+def PdWriteVFRCZP : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVFRCZP], (instrs VFRCZPDrr, VFRCZPSrr)>;
+
+def PdWriteVFRCZS : SchedWriteRes<[PdFPU1, PdFPSTO]> {
let Latency = 10;
+ let ResourceCycles = [10, 1];
let NumMicroOps = 2;
}
-def : InstRW<[PdWriteVFRCZ], (instrs VFRCZPDrr, VFRCZPSrr,
- VFRCZSDrr, VFRCZSSrr)>;
+def : InstRW<[PdWriteVFRCZS], (instrs VFRCZSDrr, VFRCZSSrr)>;
def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
let Latency = 15;
- let NumMicroOps = 2;
+ let ResourceCycles = [2, 1];
+ let NumMicroOps = 3;
}
def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm,
VFRCZSDrm, VFRCZSSrm)>;
def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> {
let Latency = 10;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [3, 1];
let NumMicroOps = 4;
}
def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>;
def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
let Latency = 15;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [4, 1];
let NumMicroOps = 8;
}
def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>;
-defm : PdWriteResXMMPair<WriteFLogic, [PdFPU01, PdFPFMA], 2>;
+defm : PdWriteResXMMPair<WriteFLogic, [PdFPU01, PdFPFMA], 2, [1, 2]>;
defm : PdWriteResYMMPair<WriteFLogicY, [PdFPU01, PdFPFMA], 2, [2, 2]>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
defm : PdWriteResXMMPair<WriteFTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
-defm : PdWriteResYMMPair<WriteFTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 2, 1], 4, 2>;
+defm : PdWriteResYMMPair<WriteFTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [4, 4, 1], 4, 2>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;
-defm : PdWriteResXMMPair<WriteFShuffle, [PdFPU01, PdFPFMA], 2>;
-defm : PdWriteResYMMPair<WriteFShuffleY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
+defm : PdWriteResXMMPair<WriteFShuffle, [PdFPU01, PdFPFMA], 2, [1, 2]>;
+defm : PdWriteResYMMPair<WriteFShuffleY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> {
let Latency = 7;
+ let ResourceCycles = [1, 3];
let NumMicroOps = 2;
}
def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>;
-defm : PdWriteResXMMPair<WriteFVarShuffle, [PdFPU01, PdFPFMA], 3, [1, 4]>;
-defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA], 3, [2, 6], 2>;
+defm : PdWriteResXMMPair<WriteFVarShuffle, [PdFPU01, PdFPFMA], 3, [1, 2]>;
+defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA], 3, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
-defm : PdWriteResXMMPair<WriteFBlend, [PdFPU01, PdFPFMA], 2>;
-defm : PdWriteResYMMPair<WriteFBlendY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
+defm : PdWriteResXMMPair<WriteFBlend, [PdFPU01, PdFPFMA], 2, [1, 3]>;
+defm : PdWriteResYMMPair<WriteFBlendY, [PdFPU01, PdFPFMA], 2, [2, 3], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
-defm : PdWriteResXMMPair<WriteFVarBlend, [PdFPU01, PdFPFMA], 2, [1, 4]>;
-defm : PdWriteResYMMPair<WriteFVarBlendY, [PdFPU01, PdFPFMA], 2, [2, 6], 2>;
+defm : PdWriteResXMMPair<WriteFVarBlend, [PdFPU01, PdFPFMA], 2, [1, 3]>;
+defm : PdWriteResYMMPair<WriteFVarBlendY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
-defm : PdWriteResXMMPair<WriteFShuffle256, [PdFPU01, PdFPFMA], 2, [], 2>;
+defm : PdWriteResXMMPair<WriteFShuffle256, [PdFPU01, PdFPFMA], 2, [1, 3], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
let Latency = 2;
+ let ResourceCycles = [1, 2];
}
def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>;
def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
let Latency = 7;
+ let ResourceCycles = [1, 4];
let NumMicroOps = 2;
}
def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>;
def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
let Latency = 4;
+ let ResourceCycles = [1, 6];
let NumMicroOps = 8;
}
def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>;
def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> {
let Latency = 8; // 4 + 4
+ let ResourceCycles = [1, 8];
let NumMicroOps = 10;
}
def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
@@ -842,99 +955,100 @@ def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
// Conversions.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResXMMPair<WriteCvtSS2I, [PdFPU1, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
+defm : PdWriteResXMMPair<WriteCvtSS2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
-defm : PdWriteResXMMPair<WriteCvtPS2I, [PdFPU1, PdFPSTO], 4>;
-defm : PdWriteResYMMPair<WriteCvtPS2IY, [PdFPU1, PdFPSTO], 4, [2, 1]>;
+defm : PdWriteResXMMPair<WriteCvtPS2I, [PdFPU0, PdFPCVT, PdFPSTO], 4>;
+defm : PdWriteResYMMPair<WriteCvtPS2IY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
-defm : PdWriteResXMMPair<WriteCvtSD2I, [PdFPU1, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
+defm : PdWriteResXMMPair<WriteCvtSD2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
-defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU1, PdFPSTO], 8, [], 2>;
-defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
-def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
let Latency = 6;
let NumMicroOps = 2;
}
def : InstRW<[PdWriteMMX_CVTTPD2PIirr], (instrs MMX_CVTTPD2PIirr)>;
// FIXME: f+3 ST, LD+STC latency
-defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU1, PdFPSTO], 4, [], 2>;
+defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
// FIXME: .Folded version is one NumMicroOp *less*..
-defm : PdWriteResXMMPair<WriteCvtI2PS, [PdFPU1, PdFPSTO], 4>;
-defm : PdWriteResYMMPair<WriteCvtI2PSY, [PdFPU1, PdFPSTO], 4, [2, 1]>;
+defm : PdWriteResXMMPair<WriteCvtI2PS, [PdFPU0, PdFPCVT, PdFPSTO], 4>;
+defm : PdWriteResYMMPair<WriteCvtI2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
-defm : PdWriteResXMMPair<WriteCvtI2SD, [PdFPU1, PdFPSTO], 4, [], 2>;
+defm : PdWriteResXMMPair<WriteCvtI2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
// FIXME: .Folded version is one NumMicroOp *less*..
-def WriteCVTSI642SDrr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+def PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
let Latency = 13;
+ let ResourceCycles = [1, 3, 1];
let NumMicroOps = 2;
}
-def : InstRW<[WriteCVTSI642SDrr], (instrs CVTSI642SDrr, CVTSI642SSrr)>;
+def : InstRW<[PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr], (instrs CVTSI642SDrr, CVTSI642SSrr, CVTSI2SDrr, CVTSI2SSrr)>;
-defm : PdWriteResXMMPair<WriteCvtI2PD, [PdFPU1, PdFPSTO], 8, [], 2>;
-defm : PdWriteResYMMPair<WriteCvtI2PDY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 1>;
+defm : PdWriteResXMMPair<WriteCvtI2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtI2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
-defm : PdWriteResXMMPair<WriteCvtSS2SD, [PdFPU1, PdFPSTO], 4>;
+defm : PdWriteResXMMPair<WriteCvtSS2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
-defm : PdWriteResXMMPair<WriteCvtPS2PD, [PdFPU1, PdFPSTO], 8, [], 2>;
-defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 1>;
+defm : PdWriteResXMMPair<WriteCvtPS2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
-defm : PdWriteResXMMPair<WriteCvtSD2SS, [PdFPU1, PdFPSTO], 4>;
+defm : PdWriteResXMMPair<WriteCvtSD2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
-defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU1, PdFPSTO], 8, [], 2>;
-defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
-def WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+def PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
let Latency = 6;
let NumMicroOps = 2;
}
-def : InstRW<[WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr,
+def : InstRW<[PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr,
MMX_CVTPI2PDirr)>;
-def WriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+def PdWriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
let Latency = 4;
let NumMicroOps = 2;
}
-def : InstRW<[WriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>;
+def : InstRW<[PdWriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>;
-defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU1, PdFPSTO], 8, [], 2, 1>;
-defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 3>;
+defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2, 1>;
+defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 3>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
-defm : PdWriteRes<WriteCvtPS2PH, [PdFPU1, PdFPSTO], 8, [], 2>;
-defm : PdWriteRes<WriteCvtPS2PHY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : PdWriteRes<WriteCvtPS2PH, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2>;
+defm : PdWriteRes<WriteCvtPS2PHY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
-defm : PdWriteRes<WriteCvtPS2PHSt, [PdFPU1, PdFPSTO, PdStore], 4, [], 3>;
-defm : PdWriteRes<WriteCvtPS2PHYSt, [PdFPU1, PdFPSTO, PdFPFMA, PdStore], 4, [2, 1, 1, 1], 4>;
+defm : PdWriteRes<WriteCvtPS2PHSt, [PdFPU0, PdFPCVT, PdFPSTO, PdStore], 4, [1, 2, 1, 1], 3>;
+defm : PdWriteRes<WriteCvtPS2PHYSt, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdStore], 4, [1, 2, 1, 1, 1], 4>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
////////////////////////////////////////////////////////////////////////////////
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteRes<WriteVecLoad, [PdLoad, PdFPU01, PdFPMAL], 5>;
-defm : PdWriteRes<WriteVecLoadX, [PdLoad, PdFPU01, PdFPMAL], 5>;
-defm : PdWriteRes<WriteVecLoadY, [PdLoad, PdFPU01, PdFPMAL], 5, [], 2>;
+defm : PdWriteRes<WriteVecLoad, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteVecLoadX, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteVecLoadY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 3], 2>;
-defm : PdWriteRes<WriteVecLoadNT, [PdLoad, PdFPU01, PdFPMAL], 5>;
-defm : PdWriteRes<WriteVecLoadNTY, [PdLoad, PdFPU01, PdFPMAL], 5>;
+defm : PdWriteRes<WriteVecLoadNT, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 4]>;
+defm : PdWriteRes<WriteVecLoadNTY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 4]>;
-defm : PdWriteRes<WriteVecMaskedLoad, [PdLoad, PdFPU01, PdFPMAL], 6, [1, 1, 2]>;
-defm : PdWriteRes<WriteVecMaskedLoadY, [PdLoad, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>;
+defm : PdWriteRes<WriteVecMaskedLoad, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 1, 2]>;
+defm : PdWriteRes<WriteVecMaskedLoadY, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 2, 4], 2>;
-defm : PdWriteRes<WriteVecStore, [PdStore, PdFPU1, PdFPSTO], 2>;
-defm : PdWriteRes<WriteVecStoreX, [PdStore, PdFPU1, PdFPSTO]>;
-defm : PdWriteRes<WriteVecStoreY, [PdStore, PdFPU1, PdFPSTO], 1, [], 4>;
+defm : PdWriteRes<WriteVecStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>;
+defm : PdWriteRes<WriteVecStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>;
+defm : PdWriteRes<WriteVecStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [2, 36, 2], 4>;
def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
let NumMicroOps = 8;
@@ -948,24 +1062,33 @@ defm : PdWriteRes<WriteVecMaskedStore, [PdStore, PdFPU01, PdFPMAL], 6, [1,
defm : PdWriteRes<WriteVecMaskedStoreY, [PdStore, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>;
defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>;
defm : PdWriteRes<WriteVecMoveY, [PdFPU01, PdFPMAL], 2, [2, 2], 2>;
-defm : PdWriteRes<WriteVecMoveToGpr, [PdFPU0, PdFPFMA, PdEX0], 10>;
-defm : PdWriteRes<WriteVecMoveFromGpr, [PdFPU01, PdFPFMA], 10, [], 2>;
+def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+}
+def : InstRW<[PdWriteMOVDQArr], (instrs MOVDQArr)>;
+
+def PdWriteMOVQ2DQrr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 4;
+}
+def : InstRW<[PdWriteMOVQ2DQrr], (instrs MMX_MOVQ2DQrr)>;
+
+defm : PdWriteRes<WriteVecMoveToGpr, [PdFPU0, PdFPFMA, PdEX0], 11>;
+defm : PdWriteRes<WriteVecMoveFromGpr, [PdFPU01, PdFPFMA], 11, [1, 2], 2>;
defm : PdWriteResXMMPair<WriteVecALU, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteResXMMPair<WriteVecALUX, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecALUX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
-defm : PdWriteResXMMPair<WriteVecShift, [PdFPU01, PdFPMAL], 3>;
-defm : PdWriteResXMMPair<WriteVecShiftX, [PdFPU01, PdFPMAL], 3>;
+defm : PdWriteResXMMPair<WriteVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>;
+defm : PdWriteResXMMPair<WriteVecShiftX, [PdFPU01, PdFPMAL], 3, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
-defm : PdWriteResXMMPair<WriteVecShiftImm, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteResXMMPair<WriteVecShiftImmX, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecShiftImm, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : PdWriteResXMMPair<WriteVecShiftImmX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
@@ -978,55 +1101,67 @@ defm : PdWriteResXMMPair<WritePMULLD, [PdFPU0, PdFPU01, PdFPMMA, PdFPMAL]
defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
-def JWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPU01, PdFPMMA, PdFPMAL]> {
+def PdWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPMMA, PdFPMAL]> {
let Latency = 4;
- let ResourceCycles = [2, 1, 2, 1];
}
-def : InstRW<[JWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr,
- VPMACSSDQLrr)>;
+def : InstRW<[PdWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr,
+ VPMACSSDQLrr)>;
-defm : PdWriteResXMMPair<WriteMPSAD, [PdFPU0, PdFPMMA], 9, [1, 2], 9>;
+defm : PdWriteResXMMPair<WriteMPSAD, [PdFPU0, PdFPMMA], 9, [1, 4], 8>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
-defm : PdWriteResXMMPair<WritePSADBW, [PdFPU01, PdFPMAL], 4, [], 2>;
-defm : PdWriteResXMMPair<WritePSADBWX, [PdFPU01, PdFPMAL], 4, [], 2>;
+def PdWriteVMPSADBW : SchedWriteRes<[PdFPU0, PdFPMMA]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 4];
+ let NumMicroOps = 10;
+}
+def : InstRW<[PdWriteVMPSADBW], (instrs VMPSADBWrri)>;
+
+defm : PdWriteResXMMPair<WritePSADBW, [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
+defm : PdWriteResXMMPair<WritePSADBWX, [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : PdWriteResXMMPair<WritePHMINPOS, [PdFPU0, PdFPMAL], 4, [], 2>;
-defm : PdWriteResXMMPair<WriteShuffle, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteResXMMPair<WriteShuffleX, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteResYMMPair<WriteShuffleY, [PdFPU01, PdFPMAL], 2, [1, 1]>;
+defm : PdWriteResXMMPair<WriteShuffle, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : PdWriteResXMMPair<WriteShuffleX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : PdWriteResYMMPair<WriteShuffleY, [PdFPU01, PdFPMAL], 2, [1, 4]>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
-defm : PdWriteResXMMPair<WriteVarShuffle, [PdFPU01, PdFPMAL], 3, [1, 4]>;
-defm : PdWriteResXMMPair<WriteVarShuffleX, [PdFPU01, PdFPMAL], 3, [1, 4]>;
+defm : PdWriteResXMMPair<WriteVarShuffle, [PdFPU01, PdFPMAL], 3, [1, 2]>;
+defm : PdWriteResXMMPair<WriteVarShuffleX, [PdFPU01, PdFPMAL], 3, [1, 3]>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+def PdWriteVPPERM : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[PdWriteVPPERM], (instrs VPPERMrrr, VPPERMrrr_REV)>;
+
defm : PdWriteResXMMPair<WriteBlend, [PdFPU01, PdFPMAL], 2>;
defm : X86WriteResPairUnsupported<WriteBlendY>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
-defm : PdWriteResXMMPair<WriteVarBlend, [PdFPU01, PdFPMAL], 2, [1, 4]>;
+defm : PdWriteResXMMPair<WriteVarBlend, [PdFPU01, PdFPMAL], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
defm : PdWriteResXMMPair<WriteVecLogic, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteResXMMPair<WriteVecLogicX, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecLogicX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVecLogicY>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
defm : PdWriteResXMMPair<WriteVecTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
-defm : PdWriteResYMMPair<WriteVecTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 2, 1], 4, 2>;
+defm : PdWriteResYMMPair<WriteVecTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 4, 1], 4, 2>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
defm : PdWriteResXMMPair<WriteShuffle256, [PdFPU01, PdFPMAL]>;
defm : PdWriteResXMMPair<WriteVarShuffle256, [PdFPU01, PdFPMAL]>;
-defm : PdWriteResXMMPair<WriteVarVecShift, [PdFPU01, PdFPMAL], 3>;
+defm : PdWriteResXMMPair<WriteVarVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
@@ -1034,14 +1169,15 @@ defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteRes<WriteVecInsert, [PdFPU01, PdFPMAL], 2, [], 2>;
-defm : PdWriteRes<WriteVecInsertLd, [PdFPU01, PdFPMAL, PdLoad], 6, [], 2>;
+defm : PdWriteRes<WriteVecInsert, [PdFPU01, PdFPMAL], 2, [1, 3], 2>;
+defm : PdWriteRes<WriteVecInsertLd, [PdFPU01, PdFPMAL, PdLoad], 6, [1, 4, 3], 2>;
-defm : PdWriteRes<WriteVecExtract, [PdFPU0, PdFPFMA, PdEX0], 13, [], 2>;
-defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [], 2>;
+defm : PdWriteRes<WriteVecExtract, [PdFPU0, PdFPFMA, PdEX0], 12, [1, 3, 1], 2>;
+defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [2, 1, 1], 2>;
def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
let Latency = 3;
+ let ResourceCycles = [1, 3];
}
def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;
@@ -1049,19 +1185,19 @@ def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;
// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 14, [1, 2, 1], 7, 1>;
-defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0], 6, [1, 2, 1], 7, 2>;
+defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 11, [1, 6, 1], 7, 1>;
+defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0], 7, [1, 8, 1], 7, 2>;
-defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 15, [1, 2, 6, 4, 1, 1], 27, 1>;
-defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 2, 6, 4, 1, 1], 27, 1>;
+defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 14, [1, 10, 10, 10, 1, 1], 27, 1>;
+defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 10, 10, 10, 1, 1], 27, 1>;
////////////////////////////////////////////////////////////////////////////////
// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 10, [], 2>;
+defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
-defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 13, [], 2>;
+defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
// defm : X86WriteResUnsupported<WriteVecMOVMSKZ>;
@@ -1079,12 +1215,12 @@ defm : PdWriteResXMMPair<WriteAESDecEnc, [PdFPU0, PdFPMMA], 9, [], 2>;
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResXMMPair<WriteFHAdd, [PdFPU0, PdFPFMA], 11, [], 3, 1>;
-defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [2, 1], 8, 2>;
+defm : PdWriteResXMMPair<WriteFHAdd, [PdFPU0, PdFPFMA], 11, [1, 5], 3, 1>;
+defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [1, 8], 8, 2>;
defm : X86WriteResPairUnsupported<WriteFHAddZ>;
-defm : PdWriteResXMMPair<WritePHAdd, [PdFPU01, PdFPMAL], 5, [], 3, 1>;
-defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WritePHAdd, [PdFPU01, PdFPMAL], 5, [1, 4], 3, 1>;
+defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WritePHAddY>;
defm : X86WriteResPairUnsupported<WritePHAddZ>;
@@ -1106,10 +1242,11 @@ def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm,
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [], 5, 1>;
+defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [1, 7], 5, 1>;
def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> {
- let Latency = 13;
+ let Latency = 12;
+ let ResourceCycles = [1, 7];
let NumMicroOps = 6;
}
def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
@@ -1120,9 +1257,15 @@ def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
let Latency = 3;
- let ResourceCycles = [1, 4];
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ)>;
+
+def PdWriteINSERTQI : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 3];
}
-def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
+def : InstRW<[PdWriteINSERTQI], (instrs INSERTQI)>;
////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
index 33a6b01546d7..2d26232b4132 100644
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -1,9 +1,8 @@
//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -109,6 +108,11 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;
+/// "Additional 6 cycle transfer operation which moves a floating point
+/// operation input value from the integer unit to the floating point unit."
+/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
+def : ReadAdvance<ReadInt2Fpu, -6>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
@@ -174,6 +178,8 @@ multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
}
}
+// Instructions that have local forwarding disabled have an extra +1cy latency.
+
// A folded store needs a cycle on the SAGU for the store data,
// most RMW instructions don't need an extra uop.
defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;
@@ -215,7 +221,6 @@ defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;
defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>; // Conditional move.
-defm : JWriteResIntPair<WriteCMOV2, [JALU01], 1>; // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
@@ -262,14 +267,13 @@ defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteStore, [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove, [JALU01]>;
// Load/store MXCSR.
-// FIXME: These are copy and pasted from WriteLoad/Store.
-def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteSTMXCSR, [JSAGU]>;
// Treat misc copies as a move.
@@ -400,8 +404,8 @@ defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
-defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 2, [1, 4], 3>;
-defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 3, [2, 6], 6>;
+defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
+defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>;
@@ -425,12 +429,13 @@ defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
-// FIXME: f+3 ST, LD+STC latency
-defm : JWriteResFpuPair<WriteCvtI2SS, [JFPU1, JSTC], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2SS, [JFPU1, JSTC], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2SSLd, [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
-defm : JWriteResFpuPair<WriteCvtI2SD, [JFPU1, JSTC], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2SD, [JFPU1, JSTC], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2SDLd, [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
@@ -487,11 +492,11 @@ defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>;
-defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
-defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
defm : X86WriteResPairUnsupported<WriteVarVecShift>;
@@ -540,7 +545,7 @@ defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////
-defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
@@ -575,10 +580,10 @@ defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 3>;
-defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 3, [2,2], 2>;
-defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>;
-defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 4>; // +1cy latency.
+defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2,2], 2>; // +1cy latency.
+defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WritePHAddY>;
////////////////////////////////////////////////////////////////////////////////
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index fcaff7cf810f..34c251a5c5bb 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -1,9 +1,8 @@
//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -53,6 +52,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>;
def : ReadAdvance<ReadAfterVecXLd, 3>;
def : ReadAdvance<ReadAfterVecYLd, 3>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -130,7 +131,6 @@ defm : SLMWriteResPair<WriteJump, [SLM_IEC_RSV1], 1>;
defm : SLMWriteResPair<WriteCRC32, [SLM_IEC_RSV1], 3>;
defm : SLMWriteResPair<WriteCMOV, [SLM_IEC_RSV01], 2, [2]>;
-defm : SLMWriteResPair<WriteCMOV2, [SLM_IEC_RSV01], 2, [2]>;
defm : X86WriteRes<WriteFCMOV, [SLM_FPC_RSV1], 3, [1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [SLM_IEC_RSV01]>;
def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> {
diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td
index a866f843106b..65f6d89df610 100644
--- a/lib/Target/X86/X86ScheduleZnver1.td
+++ b/lib/Target/X86/X86ScheduleZnver1.td
@@ -1,9 +1,8 @@
//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -95,6 +94,8 @@ def : ReadAdvance<ReadAfterVecLd, 8>;
def : ReadAdvance<ReadAfterVecXLd, 8>;
def : ReadAdvance<ReadAfterVecYLd, 8>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// The Integer PRF for Zen is 168 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: "Software Optimization Guide for AMD Family 17h Processors"
@@ -214,7 +215,6 @@ defm : ZnWriteResPair<WriteJump, [ZnALU], 1>;
defm : ZnWriteResFpuPair<WriteCRC32, [ZnFPU0], 3>;
defm : ZnWriteResPair<WriteCMOV, [ZnALU], 1>;
-defm : ZnWriteResPair<WriteCMOV2, [ZnALU], 1>;
def : WriteRes<WriteSETCC, [ZnALU]>;
def : WriteRes<WriteSETCCStore, [ZnALU, ZnAGU]>;
defm : X86WriteRes<WriteLAHFSAHF, [ZnALU], 2, [1], 2>;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 008a9ec2ba3c..50690953eef5 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -44,24 +43,6 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
return false;
}
-namespace {
-
-// Represents a cover of a buffer of Size bytes with Count() blocks of type AVT
-// (of size UBytes() bytes), as well as how many bytes remain (BytesLeft() is
-// always smaller than the block size).
-struct RepMovsRepeats {
- RepMovsRepeats(uint64_t Size) : Size(Size) {}
-
- uint64_t Count() const { return Size / UBytes(); }
- uint64_t BytesLeft() const { return Size % UBytes(); }
- uint64_t UBytes() const { return AVT.getSizeInBits() / 8; }
-
- const uint64_t Size;
- MVT AVT = MVT::i8;
-};
-
-} // namespace
-
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
SDValue Size, unsigned Align, bool isVolatile,
@@ -201,98 +182,137 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
return Chain;
}
-SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
- SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
- MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
- // This requires the copy size to be a constant, preferably
- // within a subtarget-specific limit.
- ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
- const X86Subtarget &Subtarget =
- DAG.getMachineFunction().getSubtarget<X86Subtarget>();
- if (!ConstantSize)
- return SDValue();
- RepMovsRepeats Repeats(ConstantSize->getZExtValue());
- if (!AlwaysInline && Repeats.Size > Subtarget.getMaxInlineSizeThreshold())
+/// Emit a single REP MOVS{B,W,D,Q} instruction.
+static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue Chain, SDValue Dst,
+ SDValue Src, SDValue Size, MVT AVT) {
+ const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+ const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
+ const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
+ const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;
+
+ SDValue InFlag;
+ Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = {Chain, DAG.getValueType(AVT), InFlag};
+ return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
+}
+
+/// Emit a single REP MOVSB instruction for a particular constant size.
+static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue Chain, SDValue Dst,
+ SDValue Src, uint64_t Size) {
+ return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
+ DAG.getIntPtrConstant(Size, dl), MVT::i8);
+}
+
+/// Returns the best type to use with repmovs depending on alignment.
+static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
+ uint64_t Align) {
+ assert((Align != 0) && "Align is normalized");
+ assert(isPowerOf2_64(Align) && "Align is a power of 2");
+ switch (Align) {
+ case 1:
+ return MVT::i8;
+ case 2:
+ return MVT::i16;
+ case 4:
+ return MVT::i32;
+ default:
+ return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+ }
+}
+
+/// Returns a REP MOVS instruction, possibly with a few load/stores to implement
+/// a constant size memory copy. In some cases where we know REP MOVS is
+/// inefficient we return an empty SDValue so the calling code can either
+/// generate a load/store sequence or call the runtime memcpy function.
+static SDValue emitConstantSizeRepmov(
+ SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
+ unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
+
+  /// TODO: Revisit the next check: big copies with ERMSB on march >= haswell
+  /// are very efficient.
+ if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
return SDValue();
- /// If not DWORD aligned, it is more efficient to call the library. However
- /// if calling the library is not allowed (AlwaysInline), then soldier on as
- /// the code generated here is better than the long load-store sequence we
- /// would otherwise get.
+  /// If we have enhanced repmovs, use it.
+ if (Subtarget.hasERMSB())
+ return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
+
+ assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
+ /// We assume runtime memcpy will do a better job for unaligned copies when
+ /// ERMS is not present.
if (!AlwaysInline && (Align & 3) != 0)
return SDValue();
+ const MVT BlockType = getOptimalRepmovsType(Subtarget, Align);
+ const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
+ const uint64_t BlockCount = Size / BlockBytes;
+ const uint64_t BytesLeft = Size % BlockBytes;
+ SDValue RepMovs =
+ emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
+ DAG.getIntPtrConstant(BlockCount, dl), BlockType);
+
+ /// RepMov can process the whole length.
+ if (BytesLeft == 0)
+ return RepMovs;
+
+ assert(BytesLeft && "We have leftover at this point");
+
+  /// When optimizing for size, use repmovsb even if it is less efficient, so
+  /// that we save the loads/stores for the leftover bytes.
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
+
+ // Handle the last 1 - 7 bytes.
+ SmallVector<SDValue, 4> Results;
+ Results.push_back(RepMovs);
+ unsigned Offset = Size - BytesLeft;
+ EVT DstVT = Dst.getValueType();
+ EVT SrcVT = Src.getValueType();
+ Results.push_back(DAG.getMemcpy(
+ Chain, dl,
+ DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
+ DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
+ DAG.getConstant(BytesLeft, dl, SizeVT), Align, isVolatile,
+ /*AlwaysInline*/ true, /*isTailCall*/ false,
+ DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
+}
+
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
// If to a segment-relative address space, use the default lowering.
- if (DstPtrInfo.getAddrSpace() >= 256 ||
- SrcPtrInfo.getAddrSpace() >= 256)
+ if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
return SDValue();
- // If the base register might conflict with our physical registers, bail out.
+ // If the base registers conflict with our physical registers, use the default
+ // lowering.
const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
X86::ECX, X86::ESI, X86::EDI};
if (isBaseRegConflictPossible(DAG, ClobberSet))
return SDValue();
- // If the target has enhanced REPMOVSB, then it's at least as fast to use
- // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle
- // BytesLeft.
- if (!Subtarget.hasERMSB() && !(Align & 1)) {
- if (Align & 2)
- // WORD aligned
- Repeats.AVT = MVT::i16;
- else if (Align & 4)
- // DWORD aligned
- Repeats.AVT = MVT::i32;
- else
- // QWORD aligned
- Repeats.AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-
- if (Repeats.BytesLeft() > 0 &&
- DAG.getMachineFunction().getFunction().optForMinSize()) {
- // When aggressively optimizing for size, avoid generating the code to
- // handle BytesLeft.
- Repeats.AVT = MVT::i8;
- }
- }
-
- bool Use64BitRegs = Subtarget.isTarget64BitLP64();
- SDValue InFlag;
- Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
- DAG.getIntPtrConstant(Repeats.Count(), dl), InFlag);
- InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
- Dst, InFlag);
- InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RSI : X86::ESI,
- Src, InFlag);
- InFlag = Chain.getValue(1);
-
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue Ops[] = { Chain, DAG.getValueType(Repeats.AVT), InFlag };
- SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
+ const X86Subtarget &Subtarget =
+ DAG.getMachineFunction().getSubtarget<X86Subtarget>();
- SmallVector<SDValue, 4> Results;
- Results.push_back(RepMovs);
- if (Repeats.BytesLeft()) {
- // Handle the last 1 - 7 bytes.
- unsigned Offset = Repeats.Size - Repeats.BytesLeft();
- EVT DstVT = Dst.getValueType();
- EVT SrcVT = Src.getValueType();
- EVT SizeVT = Size.getValueType();
- Results.push_back(DAG.getMemcpy(Chain, dl,
- DAG.getNode(ISD::ADD, dl, DstVT, Dst,
- DAG.getConstant(Offset, dl,
- DstVT)),
- DAG.getNode(ISD::ADD, dl, SrcVT, Src,
- DAG.getConstant(Offset, dl,
- SrcVT)),
- DAG.getConstant(Repeats.BytesLeft(), dl,
- SizeVT),
- Align, isVolatile, AlwaysInline, false,
- DstPtrInfo.getWithOffset(Offset),
- SrcPtrInfo.getWithOffset(Offset)));
- }
+  /// Handle constant sizes.
+ if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
+ return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(),
+ Size.getValueType(), Align, isVolatile,
+ AlwaysInline, DstPtrInfo, SrcPtrInfo);
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
+ return SDValue();
}
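
The constant-size path above boils down to a simple arithmetic split: pick a block type from the alignment, issue one REP MOVS over whole blocks, and copy the remaining 1-7 bytes separately. A minimal standalone sketch of that computation (plain C++, not the LLVM helpers; RepMovsPlan and planRepMovs are illustrative names):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    struct RepMovsPlan {
      uint64_t BlockBytes; // operand size of the REP MOVS (1, 2, 4 or 8)
      uint64_t BlockCount; // value loaded into RCX/ECX
      uint64_t BytesLeft;  // tail handled by a small inline memcpy
    };

    static RepMovsPlan planRepMovs(uint64_t Size, uint64_t Align, bool Is64Bit) {
      assert(Align != 0 && (Align & (Align - 1)) == 0 && "Align is a power of 2");
      uint64_t BlockBytes;
      switch (Align) {
      case 1:  BlockBytes = 1; break;
      case 2:  BlockBytes = 2; break;
      case 4:  BlockBytes = 4; break;
      default: BlockBytes = Is64Bit ? 8 : 4; break;
      }
      return {BlockBytes, Size / BlockBytes, Size % BlockBytes};
    }

    int main() {
      // A 100-byte, 8-byte-aligned copy on x86-64: 12 QWORD blocks + 4 tail bytes.
      RepMovsPlan P = planRepMovs(100, 8, /*Is64Bit=*/true);
      std::printf("blocks=%llu x %llu bytes, tail=%llu\n",
                  (unsigned long long)P.BlockCount,
                  (unsigned long long)P.BlockBytes,
                  (unsigned long long)P.BytesLeft);
      return 0;
    }
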
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index f4a285a5f916..0f2d979f91e3 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -1,9 +1,8 @@
//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 720be8afa62c..a202fc63637b 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -1,9 +1,8 @@
//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index b08c31935d28..296341517579 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -1,9 +1,8 @@
//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index a729161a1beb..40f5dbe57e4b 100644
--- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -1,9 +1,8 @@
//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -123,10 +122,7 @@ namespace {
class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
public:
- X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
- initializeX86SpeculativeLoadHardeningPassPass(
- *PassRegistry::getPassRegistry());
- }
+ X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override {
return "X86 speculative load hardening";
@@ -661,7 +657,7 @@ X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
// jmpq *%rax
// ```
// We still want to harden the edge to `L1`.
- if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
+ if (X86::getCondFromBranch(MI) == X86::COND_INVALID) {
Info.CondBrs.clear();
Info.UncondBr = &MI;
continue;
@@ -752,7 +748,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
for (X86::CondCode Cond : Conds) {
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
- auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
+ auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
// Note that we intentionally use an empty debug location so that
@@ -760,7 +756,8 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
TII->get(CMovOp), UpdatedStateReg)
.addReg(CurStateReg)
- .addReg(PS->PoisonReg);
+ .addReg(PS->PoisonReg)
+ .addImm(Cond);
// If this is the last cmov and the EFLAGS weren't originally
// live-in, mark them as killed.
if (!LiveEFLAGS && Cond == Conds.back())
@@ -789,7 +786,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
int &SuccCount = SuccCounts[&Succ];
- X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
+ X86::CondCode Cond = X86::getCondFromBranch(*CondBr);
X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
UncondCodeSeq.push_back(Cond);
@@ -1177,12 +1174,13 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
// Now cmov over the predicate if the comparison wasn't equal.
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
- auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes);
+ auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
auto CMovI =
BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
.addReg(PS->InitialReg)
- .addReg(PS->PoisonReg);
+ .addReg(PS->PoisonReg)
+ .addImm(X86::COND_NE);
CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
++NumInstsInserted;
LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
@@ -1963,6 +1961,14 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
LLVM_DEBUG(
dbgs() << " Skipping hardening base of explicit stack frame load: ";
MI.dump(); dbgs() << "\n");
+ } else if (BaseMO.getReg() == X86::RSP) {
+ // Some idempotent atomic operations are lowered directly to a locked
+    // OR with 0 to the top of stack (or slightly offset from top), which uses an
+ // explicit RSP register as the base.
+ assert(IndexMO.getReg() == X86::NoRegister &&
+ "Explicit RSP access with dynamic index!");
+ LLVM_DEBUG(
+ dbgs() << " Cannot harden base of explicit RSP offset in a load!");
} else if (BaseMO.getReg() == X86::RIP ||
BaseMO.getReg() == X86::NoRegister) {
// For both RIP-relative addressed loads or absolute loads, we cannot
@@ -2464,7 +2470,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
// If we have no red zones or if the function returns twice (possibly without
// using the `ret` instruction) like setjmp, we need to save the expected
// return address prior to the call.
- if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone) ||
+ if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) ||
MF.exposesReturnsTwice()) {
// If we don't have red zones, we need to compute the expected return
// address prior to the call and store it in a register that lives across
@@ -2546,12 +2552,13 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
// Now conditionally update the predicate state we just extracted if we ended
// up at a different return address than expected.
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
- auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes);
+ auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
.addReg(NewStateReg, RegState::Kill)
- .addReg(PS->PoisonReg);
+ .addReg(PS->PoisonReg)
+ .addImm(X86::COND_NE);
CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
++NumInstsInserted;
LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
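
The three CMOV sites above all follow the same new shape: the opcode is chosen purely by operand size, and the condition code travels as a trailing immediate operand instead of being baked into the opcode. A rough standalone model of that split (illustrative names and mnemonics, not LLVM's MachineInstr API):

    #include <cstdio>

    enum CondCode { COND_E, COND_NE };

    static const char *getCMovMnemonicForSize(int RegSizeInBytes) {
      switch (RegSizeInBytes) {
      case 2:  return "CMOV16rr";
      case 4:  return "CMOV32rr";
      default: return "CMOV64rr";
      }
    }

    int main() {
      // The condition is no longer part of the opcode; it rides along as an
      // extra operand, mirroring .addImm(X86::COND_NE) in the pass above.
      CondCode Cond = COND_NE;
      std::printf("%s dst, src, cond=%d\n", getCMovMnemonicForSize(8), (int)Cond);
      return 0;
    }
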
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 0c9ce8802e1b..d5bb56603df9 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -1,9 +1,8 @@
//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,6 +14,7 @@
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
+#include "X86MacroFusion.h"
#include "X86RegisterBankInfo.h"
#include "X86Subtarget.h"
#include "MCTargetDesc/X86BaseInfo.h"
@@ -176,10 +176,13 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
if (TM.shouldAssumeDSOLocal(M, GV))
return X86II::MO_NO_FLAG;
+ // Functions on COFF can be non-DSO local for two reasons:
+ // - They are marked dllimport
+ // - They are extern_weak, and a stub is needed
if (isTargetCOFF()) {
- assert(GV->hasDLLImportStorageClass() &&
- "shouldAssumeDSOLocal gave inconsistent answer");
- return X86II::MO_DLLIMPORT;
+ if (GV->hasDLLImportStorageClass())
+ return X86II::MO_DLLIMPORT;
+ return X86II::MO_COFFSTUB;
}
const Function *F = dyn_cast_or_null<Function>(GV);
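
The COFF branch above can be read as a small decision function: a non-DSO-local function is either dllimport, or it is extern_weak and needs a stub. A minimal sketch of that decision (standalone C++; the enum mirrors the MO_* flags used above but is otherwise illustrative):

    #include <cstdio>

    enum TargetFlag { MO_NO_FLAG, MO_DLLIMPORT, MO_COFFSTUB };

    static TargetFlag classifyCOFFFunction(bool IsDSOLocal, bool HasDLLImport) {
      if (IsDSOLocal)
        return MO_NO_FLAG;
      // Non-DSO-local on COFF: either dllimport, or extern_weak needing a stub.
      return HasDLLImport ? MO_DLLIMPORT : MO_COFFSTUB;
    }

    int main() {
      std::printf("%d\n", classifyCOFFFunction(false, false)); // 2 == MO_COFFSTUB
      return 0;
    }
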
@@ -367,3 +370,8 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
bool X86Subtarget::enableEarlyIfConversion() const {
return hasCMov() && X86EarlyIfConv;
}
+
+void X86Subtarget::getPostRAMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(createX86MacroFusionDAGMutation());
+}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index b1103f823e7f..24ccc9cb7843 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -1,9 +1,8 @@
//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -89,6 +88,9 @@ protected:
/// True if the processor supports X87 instructions.
bool HasX87 = false;
+ /// True if the processor supports CMPXCHG8B.
+ bool HasCmpxchg8b = false;
+
/// True if this processor has NOPL instruction
/// (generally pentium pro+).
bool HasNOPL = false;
@@ -295,6 +297,9 @@ protected:
/// True if the processor supports macrofusion.
bool HasMacroFusion = false;
+ /// True if the processor supports branch fusion.
+ bool HasBranchFusion = false;
+
/// True if the processor has enhanced REP MOVSB/STOSB.
bool HasERMSB = false;
@@ -348,9 +353,18 @@ protected:
/// Processor has AVX-512 Vector Neural Network Instructions
bool HasVNNI = false;
+ /// Processor has AVX-512 bfloat16 floating-point extensions
+ bool HasBF16 = false;
+
+ /// Processor supports ENQCMD instructions
+ bool HasENQCMD = false;
+
/// Processor has AVX-512 Bit Algorithms instructions
bool HasBITALG = false;
+ /// Processor has AVX-512 vp2intersect instructions
+ bool HasVP2INTERSECT = false;
+
/// Processor supports MPX - Memory Protection Extensions
bool HasMPX = false;
@@ -388,6 +402,12 @@ protected:
/// Try harder to combine to horizontal vector ops if they are fast.
bool HasFastHorizontalOps = false;
+ /// Prefer a left/right scalar logical shifts pair over a shift+and pair.
+ bool HasFastScalarShiftMasks = false;
+
+ /// Prefer a left/right vector logical shifts pair over a shift+and pair.
+ bool HasFastVectorShiftMasks = false;
+
/// Use a retpoline thunk rather than indirect calls to block speculative
/// execution.
bool UseRetpolineIndirectCalls = false;
@@ -547,6 +567,7 @@ public:
void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
bool hasX87() const { return HasX87; }
+ bool hasCmpxchg8b() const { return HasCmpxchg8b; }
bool hasNOPL() const { return HasNOPL; }
// SSE codegen depends on cmovs, and all SSE1+ processors support them.
// All 64-bit processors support cmov.
@@ -621,7 +642,7 @@ public:
int getGatherOverhead() const { return GatherOverhead; }
int getScatterOverhead() const { return ScatterOverhead; }
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
- bool hasCmpxchg16b() const { return HasCmpxchg16b; }
+ bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
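
The hasCmpxchg16b() change above folds the mode check into the query, since CMPXCHG16B is only usable in 64-bit mode. A tiny illustrative model of the gated accessor (not the real subtarget class):

    struct SubtargetModel {
      bool HasCmpxchg16b = true;
      bool Is64Bit = false;
      // The feature query is only true when the instruction is actually usable.
      bool hasCmpxchg16b() const { return HasCmpxchg16b && Is64Bit; }
    };

    int main() {
      SubtargetModel ST32;
      SubtargetModel ST64;
      ST64.Is64Bit = true;
      return (!ST32.hasCmpxchg16b() && ST64.hasCmpxchg16b()) ? 0 : 1;
    }
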
@@ -638,7 +659,10 @@ public:
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasFastBEXTR() const { return HasFastBEXTR; }
bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
+ bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; }
+ bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
bool hasMacroFusion() const { return HasMacroFusion; }
+ bool hasBranchFusion() const { return HasBranchFusion; }
bool hasERMSB() const { return HasERMSB; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
@@ -657,6 +681,8 @@ public:
bool hasVLX() const { return HasVLX; }
bool hasPKU() const { return HasPKU; }
bool hasVNNI() const { return HasVNNI; }
+ bool hasBF16() const { return HasBF16; }
+ bool hasVP2INTERSECT() const { return HasVP2INTERSECT; }
bool hasBITALG() const { return HasBITALG; }
bool hasMPX() const { return HasMPX; }
bool hasSHSTK() const { return HasSHSTK; }
@@ -669,6 +695,7 @@ public:
bool hasSGX() const { return HasSGX; }
bool threewayBranchProfitable() const { return ThreewayBranchProfitable; }
bool hasINVPCID() const { return HasINVPCID; }
+ bool hasENQCMD() const { return HasENQCMD; }
bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
@@ -744,10 +771,6 @@ public:
return TargetTriple.isWindowsMSVCEnvironment();
}
- bool isTargetKnownWindowsMSVC() const {
- return TargetTriple.isKnownWindowsMSVCEnvironment();
- }
-
bool isTargetWindowsCoreCLR() const {
return TargetTriple.isWindowsCoreCLREnvironment();
}
@@ -834,11 +857,11 @@ public:
/// Enable the MachineScheduler pass for all X86 subtargets.
bool enableMachineScheduler() const override { return true; }
- // TODO: Update the regression tests and return true.
- bool supportPrintSchedInfo() const override { return false; }
-
bool enableEarlyIfConversion() const override;
+ void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
+ &Mutations) const override;
+
AntiDepBreakMode getAntiDepBreakMode() const override {
return TargetSubtargetInfo::ANTIDEP_CRITICAL;
}
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index afcb49dc2263..0cbf13899a29 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,6 +12,7 @@
#include "X86TargetMachine.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "TargetInfo/X86TargetInfo.h"
#include "X86.h"
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
@@ -38,6 +38,7 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
@@ -70,9 +71,10 @@ extern "C" void LLVMInitializeX86Target() {
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
initializeFixupLEAPassPass(PR);
- initializeShadowCallStackPass(PR);
+ initializeFPSPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
+ initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
initializeX86AvoidSFBPassPass(PR);
@@ -194,7 +196,7 @@ static CodeModel::Model getEffectiveX86CodeModel(Optional<CodeModel::Model> CM,
bool JIT, bool Is64Bit) {
if (CM) {
if (*CM == CodeModel::Tiny)
- report_fatal_error("Target does not support the tiny CodeModel");
+ report_fatal_error("Target does not support the tiny CodeModel", false);
return *CM;
}
if (JIT)
@@ -357,6 +359,13 @@ public:
return DAG;
}
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+ DAG->addMutation(createX86MacroFusionDAGMutation());
+ return DAG;
+ }
+
void addIRPasses() override;
bool addInstSelector() override;
bool addIRTranslator() override;
@@ -371,6 +380,8 @@ public:
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreSched2() override;
+
+ std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
class X86ExecutionDomainFix : public ExecutionDomainFix {
@@ -490,7 +501,6 @@ void X86PassConfig::addPreEmitPass() {
addPass(createBreakFalseDeps());
}
- addPass(createShadowCallStackPass());
addPass(createX86IndirectBranchTrackingPass());
if (UseVZeroUpper)
@@ -512,6 +522,13 @@ void X86PassConfig::addPreEmitPass2() {
// correct CFA calculation rule where needed by inserting appropriate CFI
// instructions.
const Triple &TT = TM->getTargetTriple();
- if (!TT.isOSDarwin() && !TT.isOSWindows())
+ const MCAsmInfo *MAI = TM->getMCAsmInfo();
+ if (!TT.isOSDarwin() &&
+ (!TT.isOSWindows() ||
+ MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI))
addPass(createCFIInstrInserter());
}
+
+std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
+ return getStandardCSEConfigForOpt(TM->getOptLevel());
+}
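
The new pre-emit condition above means the CFI instruction inserter also runs on Windows targets that use DWARF CFI exception handling, not just on non-Darwin, non-Windows targets. A one-function sketch of the predicate (standalone, illustrative name):

    static bool shouldRunCFIInstrInserter(bool IsDarwin, bool IsWindows,
                                          bool UsesDwarfCFI) {
      return !IsDarwin && (!IsWindows || UsesDwarfCFI);
    }

    int main() {
      // MinGW-style Windows targets that use DWARF CFI now get the pass too.
      return shouldRunCFIInstrInserter(false, true, true) ? 0 : 1;
    }
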
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index f5b45da0c3dc..b999e2e86af6 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -1,9 +1,8 @@
//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 505c4fa07b77..92e0779c2e74 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index d045094edb1e..13d7b4ad70d6 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 36929a4f5439..3dc59aeb263e 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -1651,17 +1650,77 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- static const CostTblEntry SSE2CostTbl[] = {
- { ISD::SETCC, MVT::v2i64, 8 },
- { ISD::SETCC, MVT::v4i32, 1 },
- { ISD::SETCC, MVT::v8i16, 1 },
- { ISD::SETCC, MVT::v16i8, 1 },
+ unsigned ExtraCost = 0;
+ if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
+ // Some vector comparison predicates cost extra instructions.
+ if (MTy.isVector() &&
+ !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
+ (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
+ ST->hasBWI())) {
+ switch (cast<CmpInst>(I)->getPredicate()) {
+ case CmpInst::Predicate::ICMP_NE:
+ // xor(cmpeq(x,y),-1)
+ ExtraCost = 1;
+ break;
+ case CmpInst::Predicate::ICMP_SGE:
+ case CmpInst::Predicate::ICMP_SLE:
+ // xor(cmpgt(x,y),-1)
+ ExtraCost = 1;
+ break;
+ case CmpInst::Predicate::ICMP_ULT:
+ case CmpInst::Predicate::ICMP_UGT:
+ // cmpgt(xor(x,signbit),xor(y,signbit))
+ // xor(cmpeq(pmaxu(x,y),x),-1)
+ ExtraCost = 2;
+ break;
+ case CmpInst::Predicate::ICMP_ULE:
+ case CmpInst::Predicate::ICMP_UGE:
+ if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
+ (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
+ // cmpeq(psubus(x,y),0)
+ // cmpeq(pminu(x,y),x)
+ ExtraCost = 1;
+ } else {
+ // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
+ ExtraCost = 3;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::SETCC, MVT::v32i16, 1 },
+ { ISD::SETCC, MVT::v64i8, 1 },
+
+ { ISD::SELECT, MVT::v32i16, 1 },
+ { ISD::SELECT, MVT::v64i8, 1 },
};
- static const CostTblEntry SSE42CostTbl[] = {
- { ISD::SETCC, MVT::v2f64, 1 },
- { ISD::SETCC, MVT::v4f32, 1 },
- { ISD::SETCC, MVT::v2i64, 1 },
+ static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::SETCC, MVT::v8i64, 1 },
+ { ISD::SETCC, MVT::v16i32, 1 },
+ { ISD::SETCC, MVT::v8f64, 1 },
+ { ISD::SETCC, MVT::v16f32, 1 },
+
+ { ISD::SELECT, MVT::v8i64, 1 },
+ { ISD::SELECT, MVT::v16i32, 1 },
+ { ISD::SELECT, MVT::v8f64, 1 },
+ { ISD::SELECT, MVT::v16f32, 1 },
+ };
+
+ static const CostTblEntry AVX2CostTbl[] = {
+ { ISD::SETCC, MVT::v4i64, 1 },
+ { ISD::SETCC, MVT::v8i32, 1 },
+ { ISD::SETCC, MVT::v16i16, 1 },
+ { ISD::SETCC, MVT::v32i8, 1 },
+
+ { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
};
static const CostTblEntry AVX1CostTbl[] = {
@@ -1672,50 +1731,83 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
{ ISD::SETCC, MVT::v8i32, 4 },
{ ISD::SETCC, MVT::v16i16, 4 },
{ ISD::SETCC, MVT::v32i8, 4 },
+
+ { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
+ { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
+ { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
+ { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
+ { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
+ { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
};
- static const CostTblEntry AVX2CostTbl[] = {
- { ISD::SETCC, MVT::v4i64, 1 },
- { ISD::SETCC, MVT::v8i32, 1 },
- { ISD::SETCC, MVT::v16i16, 1 },
- { ISD::SETCC, MVT::v32i8, 1 },
+ static const CostTblEntry SSE42CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 1 },
+ { ISD::SETCC, MVT::v4f32, 1 },
+ { ISD::SETCC, MVT::v2i64, 1 },
};
- static const CostTblEntry AVX512CostTbl[] = {
- { ISD::SETCC, MVT::v8i64, 1 },
- { ISD::SETCC, MVT::v16i32, 1 },
- { ISD::SETCC, MVT::v8f64, 1 },
- { ISD::SETCC, MVT::v16f32, 1 },
+ static const CostTblEntry SSE41CostTbl[] = {
+ { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
+ { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
+ { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
};
- static const CostTblEntry AVX512BWCostTbl[] = {
- { ISD::SETCC, MVT::v32i16, 1 },
- { ISD::SETCC, MVT::v64i8, 1 },
+ static const CostTblEntry SSE2CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 2 },
+ { ISD::SETCC, MVT::f64, 1 },
+ { ISD::SETCC, MVT::v2i64, 8 },
+ { ISD::SETCC, MVT::v4i32, 1 },
+ { ISD::SETCC, MVT::v8i16, 1 },
+ { ISD::SETCC, MVT::v16i8, 1 },
+
+ { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
+ { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
+ };
+
+ static const CostTblEntry SSE1CostTbl[] = {
+ { ISD::SETCC, MVT::v4f32, 2 },
+ { ISD::SETCC, MVT::f32, 1 },
+
+ { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
};
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
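
The rewritten comparison costing above is a per-predicate surcharge added on top of the table cost. As a worked example under SSE2 only (values taken from the switch and SSE2CostTbl above; the helper name is illustrative), an unsigned greater-than compare of v4i32 pays the signbit-xor trick on top of the 1-cycle SETCC entry:

    #include <cstdio>

    static int cmpCostSSE2V4i32UnsignedGT(int LTFirst) {
      const int TableCost = 1; // { ISD::SETCC, MVT::v4i32, 1 } in SSE2CostTbl
      const int ExtraCost = 2; // cmpgt(xor(x,signbit),xor(y,signbit))
      return LTFirst * (TableCost + ExtraCost);
    }

    int main() {
      std::printf("%d\n", cmpCostSSE2V4i32UnsignedGT(1)); // prints 3
      return 0;
    }
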
@@ -1784,6 +1876,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
+ { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
+ { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
+ { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
+ { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
};
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
@@ -1825,6 +1921,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::SSUBSAT, MVT::v32i8, 1 },
{ ISD::UADDSAT, MVT::v16i16, 1 },
{ ISD::UADDSAT, MVT::v32i8, 1 },
+ { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
{ ISD::USUBSAT, MVT::v16i16, 1 },
{ ISD::USUBSAT, MVT::v32i8, 1 },
{ ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
@@ -1861,6 +1958,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
@@ -1885,6 +1983,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
};
static const CostTblEntry SSE42CostTbl[] = {
{ ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
+ { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};
@@ -1945,14 +2044,23 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
};
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
- { ISD::BITREVERSE, MVT::i64, 14 }
+ { ISD::BITREVERSE, MVT::i64, 14 },
+ { ISD::SADDO, MVT::i64, 1 },
+ { ISD::UADDO, MVT::i64, 1 },
};
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::BITREVERSE, MVT::i32, 14 },
{ ISD::BITREVERSE, MVT::i16, 14 },
- { ISD::BITREVERSE, MVT::i8, 11 }
+ { ISD::BITREVERSE, MVT::i8, 11 },
+ { ISD::SADDO, MVT::i32, 1 },
+ { ISD::SADDO, MVT::i16, 1 },
+ { ISD::SADDO, MVT::i8, 1 },
+ { ISD::UADDO, MVT::i32, 1 },
+ { ISD::UADDO, MVT::i16, 1 },
+ { ISD::UADDO, MVT::i8, 1 },
};
+ Type *OpTy = RetTy;
unsigned ISD = ISD::DELETED_NODE;
switch (IID) {
default:
@@ -1987,11 +2095,23 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
case Intrinsic::sqrt:
ISD = ISD::FSQRT;
break;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ // SSUBO has same costs so don't duplicate.
+ ISD = ISD::SADDO;
+ OpTy = RetTy->getContainedType(0);
+ break;
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ // USUBO has same costs so don't duplicate.
+ ISD = ISD::UADDO;
+ OpTy = RetTy->getContainedType(0);
+ break;
}
if (ISD != ISD::DELETED_NODE) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
MVT MTy = LT.second;
// Attempt to lookup cost.
@@ -2226,6 +2346,9 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
unsigned Alignment,
unsigned AddressSpace) {
+ bool IsLoad = (Instruction::Load == Opcode);
+ bool IsStore = (Instruction::Store == Opcode);
+
VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
if (!SrcVTy)
// To calculate scalar take the regular cost, without mask
@@ -2233,10 +2356,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
unsigned NumElem = SrcVTy->getVectorNumElements();
VectorType *MaskTy =
- VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
- if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
- (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
- !isPowerOf2_32(NumElem)) {
+ VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
+ if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) ||
+ (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) {
// Scalarization
int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
@@ -2244,8 +2366,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
int BranchCost = getCFInstrCost(Instruction::Br);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
- int ValueSplitCost = getScalarizationOverhead(
- SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
+ int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace);
@@ -2259,8 +2380,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
// Promotion requires expand/truncate for data and a shuffle for mask.
- Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
- getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
+ Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
+ getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
else if (LT.second.getVectorNumElements() > NumElem) {
VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
@@ -2268,11 +2389,13 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
// Expanding requires fill mask with zeroes
Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
}
+
+ // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
if (!ST->hasAVX512())
- return Cost + LT.first*4; // Each maskmov costs 4
+ return Cost + LT.first * (IsLoad ? 2 : 8);
  // AVX-512 masked load/store is cheaper
- return Cost+LT.first;
+ return Cost + LT.first;
}
int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
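
The tail of the masked-memory costing above distinguishes loads from stores before AVX-512: each legalized maskmov load is charged 2 and each store roughly 8, while AVX-512 masked operations are charged 1. A small sketch of that final step (standalone; the prep cost stands in for the promotion/expansion cost computed earlier):

    static int maskedMemOpCostTail(int PrepCost, int LTFirst, bool IsLoad,
                                   bool HasAVX512) {
      if (!HasAVX512)
        return PrepCost + LTFirst * (IsLoad ? 2 : 8); // pre-AVX512 maskmov
      return PrepCost + LTFirst;                      // AVX-512 masked op
    }

    int main() {
      // One legal vector, masked store, no AVX-512: 0 + 1 * 8.
      return maskedMemOpCostTail(0, 1, /*IsLoad=*/false, /*HasAVX512=*/false) == 8
                 ? 0
                 : 1;
    }
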
@@ -2281,7 +2404,7 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
// extra micro-ops can significantly decrease throughput.
- unsigned NumVectorInstToHideOverhead = 10;
+ const unsigned NumVectorInstToHideOverhead = 10;
// Cost modeling of Strided Access Computation is hidden by the indexing
  // modes of X86 regardless of the stride value. We don't believe that there
@@ -2369,6 +2492,48 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
return LT.first * Entry->Cost;
}
+ static const CostTblEntry AVX2BoolReduction[] = {
+ { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
+ { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
+ { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
+ { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
+ };
+
+ static const CostTblEntry AVX1BoolReduction[] = {
+ { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
+ { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
+ { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
+ { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
+ { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
+ { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
+ { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
+ { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
+ };
+
+ static const CostTblEntry SSE2BoolReduction[] = {
+ { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
+ { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
+ { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
+ { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
+ { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
+ { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
+ { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
+ { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
+ };
+
+ // Handle bool allof/anyof patterns.
+ if (ValTy->getVectorElementType()->isIntegerTy(1)) {
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
+ return LT.first * Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
+ return LT.first * Entry->Cost;
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
}
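
The new bool allof/anyof reduction costing above models a reduction of a vector of i1 as one movmsk-style mask extraction plus a scalar compare, with an extra extract-and-combine step for 256-bit integer vectors on plain AVX1. A minimal sketch that reproduces the v32i8 AVX1 table entry (illustrative helper name):

    static int boolReductionCost(int LTFirst, bool NeedsSplit) {
      const int MovmskPlusCmp = 2;              // e.g. pmovmskb + cmp
      const int SplitCost = NeedsSplit ? 2 : 0; // vextractf128 + vpand/vpor
      return LTFirst * (MovmskPlusCmp + SplitCost);
    }

    int main() {
      // v32i8 and-reduction on plain AVX1: 1 * (2 + 2) == 4, as in the table.
      return boolReductionCost(1, /*NeedsSplit=*/true) == 4 ? 0 : 1;
    }
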
@@ -2390,15 +2555,37 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
// We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
// and make it as the cost.
- static const CostTblEntry SSE42CostTblPairWise[] = {
+ static const CostTblEntry SSE1CostTblPairWise[] = {
+ {ISD::FMINNUM, MVT::v4f32, 4},
+ };
+
+ static const CostTblEntry SSE2CostTblPairWise[] = {
{ISD::FMINNUM, MVT::v2f64, 3},
+ {ISD::SMIN, MVT::v2i64, 6},
+ {ISD::UMIN, MVT::v2i64, 8},
+ {ISD::SMIN, MVT::v4i32, 6},
+ {ISD::UMIN, MVT::v4i32, 8},
+ {ISD::SMIN, MVT::v8i16, 4},
+ {ISD::UMIN, MVT::v8i16, 6},
+ {ISD::SMIN, MVT::v16i8, 8},
+ {ISD::UMIN, MVT::v16i8, 6},
+ };
+
+ static const CostTblEntry SSE41CostTblPairWise[] = {
{ISD::FMINNUM, MVT::v4f32, 2},
- {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
- {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
+ {ISD::SMIN, MVT::v2i64, 9},
+ {ISD::UMIN, MVT::v2i64,10},
{ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
{ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
{ISD::SMIN, MVT::v8i16, 2},
{ISD::UMIN, MVT::v8i16, 2},
+ {ISD::SMIN, MVT::v16i8, 3},
+ {ISD::UMIN, MVT::v16i8, 3},
+ };
+
+ static const CostTblEntry SSE42CostTblPairWise[] = {
+ {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
+ {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
};
static const CostTblEntry AVX1CostTblPairWise[] = {
@@ -2411,8 +2598,16 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
{ISD::UMIN, MVT::v4i32, 1},
{ISD::SMIN, MVT::v8i16, 1},
{ISD::UMIN, MVT::v8i16, 1},
+ {ISD::SMIN, MVT::v16i8, 2},
+ {ISD::UMIN, MVT::v16i8, 2},
+ {ISD::SMIN, MVT::v4i64, 7},
+ {ISD::UMIN, MVT::v4i64, 7},
{ISD::SMIN, MVT::v8i32, 3},
{ISD::UMIN, MVT::v8i32, 3},
+ {ISD::SMIN, MVT::v16i16, 3},
+ {ISD::UMIN, MVT::v16i16, 3},
+ {ISD::SMIN, MVT::v32i8, 3},
+ {ISD::UMIN, MVT::v32i8, 3},
};
static const CostTblEntry AVX2CostTblPairWise[] = {
@@ -2435,15 +2630,37 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
{ISD::UMIN, MVT::v16i32, 1},
};
- static const CostTblEntry SSE42CostTblNoPairWise[] = {
+ static const CostTblEntry SSE1CostTblNoPairWise[] = {
+ {ISD::FMINNUM, MVT::v4f32, 4},
+ };
+
+ static const CostTblEntry SSE2CostTblNoPairWise[] = {
{ISD::FMINNUM, MVT::v2f64, 3},
+ {ISD::SMIN, MVT::v2i64, 6},
+ {ISD::UMIN, MVT::v2i64, 8},
+ {ISD::SMIN, MVT::v4i32, 6},
+ {ISD::UMIN, MVT::v4i32, 8},
+ {ISD::SMIN, MVT::v8i16, 4},
+ {ISD::UMIN, MVT::v8i16, 6},
+ {ISD::SMIN, MVT::v16i8, 8},
+ {ISD::UMIN, MVT::v16i8, 6},
+ };
+
+ static const CostTblEntry SSE41CostTblNoPairWise[] = {
{ISD::FMINNUM, MVT::v4f32, 3},
- {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
- {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
+ {ISD::SMIN, MVT::v2i64, 9},
+ {ISD::UMIN, MVT::v2i64,11},
{ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
{ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
{ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
{ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
+ {ISD::SMIN, MVT::v16i8, 3},
+ {ISD::UMIN, MVT::v16i8, 3},
+ };
+
+ static const CostTblEntry SSE42CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
+ {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
};
static const CostTblEntry AVX1CostTblNoPairWise[] = {
@@ -2456,8 +2673,16 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
{ISD::UMIN, MVT::v4i32, 1},
{ISD::SMIN, MVT::v8i16, 1},
{ISD::UMIN, MVT::v8i16, 1},
+ {ISD::SMIN, MVT::v16i8, 2},
+ {ISD::UMIN, MVT::v16i8, 2},
+ {ISD::SMIN, MVT::v4i64, 7},
+ {ISD::UMIN, MVT::v4i64, 7},
{ISD::SMIN, MVT::v8i32, 2},
{ISD::UMIN, MVT::v8i32, 2},
+ {ISD::SMIN, MVT::v16i16, 2},
+ {ISD::UMIN, MVT::v16i16, 2},
+ {ISD::SMIN, MVT::v32i8, 2},
+ {ISD::UMIN, MVT::v32i8, 2},
};
static const CostTblEntry AVX2CostTblNoPairWise[] = {
@@ -2496,6 +2721,18 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
return LT.first * Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
} else {
if (ST->hasAVX512())
if (const auto *Entry =
@@ -2513,6 +2750,18 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
return LT.first * Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
}
return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
@@ -2864,26 +3113,106 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
}
bool X86TTIImpl::canMacroFuseCmp() {
- return ST->hasMacroFusion();
+ return ST->hasMacroFusion() || ST->hasBranchFusion();
}
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+ if (!ST->hasAVX())
+ return false;
+
// The backend can't handle a single element vector.
if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
return false;
Type *ScalarTy = DataTy->getScalarType();
- int DataWidth = isa<PointerType>(ScalarTy) ?
- DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
- return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
- ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
+ if (ScalarTy->isPointerTy())
+ return true;
+
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64 ||
+ ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
}
bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
return isLegalMaskedLoad(DataType);
}
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ // The only supported nontemporal loads are for aligned vectors of 16 or 32
+ // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
+ // (the equivalent stores only require AVX).
+ if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
+ return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
+
+ return false;
+}
+
+bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+
+ // SSE4A supports nontemporal stores of float and double at arbitrary
+ // alignment.
+ if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
+ return true;
+
+ // Besides the SSE4A subtarget exception above, only aligned stores are
+  // available nontemporally on any other subtarget. And only stores with a
+  // size of 4..32 bytes (powers of 2 only) are permitted.
+ if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
+ !isPowerOf2_32(DataSize))
+ return false;
+
+ // 32-byte vector nontemporal stores are supported by AVX (the equivalent
+ // loads require AVX2).
+ if (DataSize == 32)
+ return ST->hasAVX();
+ else if (DataSize == 16)
+ return ST->hasSSE1();
+ return true;
+}
+
+bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
+ if (!isa<VectorType>(DataTy))
+ return false;
+
+ if (!ST->hasAVX512())
+ return false;
+
+ // The backend can't handle a single element vector.
+ if (DataTy->getVectorNumElements() == 1)
+ return false;
+
+ Type *ScalarTy = DataTy->getVectorElementType();
+
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64 ||
+ ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
+}
+
+bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
+ return isLegalMaskedExpandLoad(DataTy);
+}
+
bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+ // Some CPUs have better gather performance than others.
+  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
+ // enable gather with a -march.
+ if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
+ return false;
+
// This function is called now in two cases: from the Loop Vectorizer
// and from the Scalarizer.
// When the Loop Vectorizer asks about legality of the feature,
@@ -2902,14 +3231,17 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
return false;
}
Type *ScalarTy = DataTy->getScalarType();
- int DataWidth = isa<PointerType>(ScalarTy) ?
- DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+ if (ScalarTy->isPointerTy())
+ return true;
- // Some CPUs have better gather performance than others.
- // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
- // enable gather with a -march.
- return (DataWidth == 32 || DataWidth == 64) &&
- (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64;
}
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
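
The nontemporal-store legality rules introduced above are: SSE4A allows scalar float/double stores at any alignment; otherwise the store must be a sufficiently aligned power-of-2 size between 4 and 32 bytes, with 32-byte stores requiring AVX and 16-byte stores requiring SSE1. A standalone model of those rules (illustrative function name, not the TTI hook itself):

    #include <cstdint>

    static bool isPow2(uint64_t X) { return X && (X & (X - 1)) == 0; }

    static bool isLegalNTStoreModel(uint64_t DataSize, uint64_t Alignment,
                                    bool IsScalarFP, bool HasSSE4A, bool HasSSE1,
                                    bool HasAVX) {
      if (HasSSE4A && IsScalarFP)
        return true; // movntss/movntsd accept arbitrary alignment
      if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
          !isPow2(DataSize))
        return false;
      if (DataSize == 32)
        return HasAVX;  // 32-byte vector NT stores need AVX
      if (DataSize == 16)
        return HasSSE1; // 16-byte vector NT stores need SSE1
      return true;      // 4- and 8-byte scalar NT stores
    }

    int main() {
      // An aligned 32-byte vector store is rejected without AVX.
      return isLegalNTStoreModel(32, 32, false, false, true, false) == false ? 0
                                                                             : 1;
    }
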
@@ -2938,44 +3270,51 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
const FeatureBitset &CalleeBits =
TM.getSubtargetImpl(*Callee)->getFeatureBits();
- // FIXME: This is likely too limiting as it will include subtarget features
- // that we might not care about for inlining, but it is conservatively
- // correct.
- return (CallerBits & CalleeBits) == CalleeBits;
+ FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
+ FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
+ return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
}
-const X86TTIImpl::TTI::MemCmpExpansionOptions *
-X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
- // Only enable vector loads for equality comparison.
- // Right now the vector version is not as fast, see #33329.
- static const auto ThreeWayOptions = [this]() {
- TTI::MemCmpExpansionOptions Options;
- if (ST->is64Bit()) {
- Options.LoadSizes.push_back(8);
- }
- Options.LoadSizes.push_back(4);
- Options.LoadSizes.push_back(2);
- Options.LoadSizes.push_back(1);
- return Options;
- }();
- static const auto EqZeroOptions = [this]() {
- TTI::MemCmpExpansionOptions Options;
+bool X86TTIImpl::areFunctionArgsABICompatible(
+ const Function *Caller, const Function *Callee,
+ SmallPtrSetImpl<Argument *> &Args) const {
+ if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
+ return false;
+
+ // If we get here, we know the target features match. If one function
+ // considers 512-bit vectors legal and the other does not, consider them
+ // incompatible.
+ // FIXME: Look at the arguments and only consider 512-bit or larger vectors?
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
+ TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
+}
+
+X86TTIImpl::TTI::MemCmpExpansionOptions
+X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+ TTI::MemCmpExpansionOptions Options;
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+ Options.NumLoadsPerBlock = 2;
+ if (IsZeroCmp) {
+ // Only enable vector loads for equality comparison. Right now the vector
+ // version is not as fast for three way compare (see #33329).
// TODO: enable AVX512 when the DAG is ready.
// if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
- if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
- if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
- if (ST->is64Bit()) {
- Options.LoadSizes.push_back(8);
- }
- Options.LoadSizes.push_back(4);
- Options.LoadSizes.push_back(2);
- Options.LoadSizes.push_back(1);
+ const unsigned PreferredWidth = ST->getPreferVectorWidth();
+ if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
+ if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
// All GPR and vector loads can be unaligned. SIMD compare requires integer
// vectors (SSE2/AVX2).
Options.AllowOverlappingLoads = true;
- return Options;
- }();
- return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
+ }
+ if (ST->is64Bit()) {
+ Options.LoadSizes.push_back(8);
+ }
+ Options.LoadSizes.push_back(4);
+ Options.LoadSizes.push_back(2);
+ Options.LoadSizes.push_back(1);
+ return Options;
}
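Concretely, for a hypothetical 64-bit subtarget with SSE2, AVX2 and a preferred vector width of at least 256 bits, an equality (IsZeroCmp) expansion ends up with LoadSizes = {32, 16, 8, 4, 2, 1} and overlapping loads allowed, while a three-way compare only gets the GPR sizes {8, 4, 2, 1}. The following is a hedged standalone restatement of just the size selection, not the real TTI hook:

    #include <vector>

    // Hypothetical mirror of the LoadSizes selection in enableMemCmpExpansion;
    // booleans stand in for the subtarget queries and PreferredWidth for
    // ST->getPreferVectorWidth(). MaxNumLoads/NumLoadsPerBlock are omitted.
    std::vector<unsigned> memcmpLoadSizesSketch(bool IsZeroCmp, bool Is64Bit,
                                                bool HasSSE2, bool HasAVX2,
                                                unsigned PreferredWidth) {
      std::vector<unsigned> Sizes;
      if (IsZeroCmp) {
        // Vector loads are only used for equality comparisons.
        if (PreferredWidth >= 256 && HasAVX2)
          Sizes.push_back(32);
        if (PreferredWidth >= 128 && HasSSE2)
          Sizes.push_back(16);
      }
      if (Is64Bit)
        Sizes.push_back(8);
      Sizes.push_back(4);
      Sizes.push_back(2);
      Sizes.push_back(1);
      // e.g. {32, 16, 8, 4, 2, 1} for an equality compare on a 64-bit AVX2
      // subtarget preferring 256-bit vectors, {8, 4, 2, 1} otherwise.
      return Sizes;
    }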
bool X86TTIImpl::enableInterleavedAccessVectorization() {
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 1637592c81f8..25d9c33eb16d 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -1,9 +1,8 @@
//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -36,6 +35,64 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const X86Subtarget *getST() const { return ST; }
const X86TargetLowering *getTLI() const { return TLI; }
+ const FeatureBitset InlineFeatureIgnoreList = {
+ // This indicates the CPU is 64-bit capable, not that we are in 64-bit
+ // mode.
+ X86::Feature64Bit,
+
+ // These features don't have any intrinsics or ABI effect.
+ X86::FeatureNOPL,
+ X86::FeatureCMPXCHG16B,
+ X86::FeatureLAHFSAHF,
+
+ // Codegen control options.
+ X86::FeatureFast11ByteNOP,
+ X86::FeatureFast15ByteNOP,
+ X86::FeatureFastBEXTR,
+ X86::FeatureFastHorizontalOps,
+ X86::FeatureFastLZCNT,
+ X86::FeatureFastPartialYMMorZMMWrite,
+ X86::FeatureFastScalarFSQRT,
+ X86::FeatureFastSHLDRotate,
+ X86::FeatureFastScalarShiftMasks,
+ X86::FeatureFastVectorShiftMasks,
+ X86::FeatureFastVariableShuffle,
+ X86::FeatureFastVectorFSQRT,
+ X86::FeatureLEAForSP,
+ X86::FeatureLEAUsesAG,
+ X86::FeatureLZCNTFalseDeps,
+ X86::FeatureBranchFusion,
+ X86::FeatureMacroFusion,
+ X86::FeatureMergeToThreeWayBranch,
+ X86::FeaturePadShortFunctions,
+ X86::FeaturePOPCNTFalseDeps,
+ X86::FeatureSSEUnalignedMem,
+ X86::FeatureSlow3OpsLEA,
+ X86::FeatureSlowDivide32,
+ X86::FeatureSlowDivide64,
+ X86::FeatureSlowIncDec,
+ X86::FeatureSlowLEA,
+ X86::FeatureSlowPMADDWD,
+ X86::FeatureSlowPMULLD,
+ X86::FeatureSlowSHLD,
+ X86::FeatureSlowTwoMemOps,
+ X86::FeatureSlowUAMem16,
+
+ // Perf-tuning flags.
+ X86::FeatureHasFastGather,
+ X86::FeatureSlowUAMem32,
+
+ // Based on whether the user set the -mprefer-vector-width command-line option.
+ X86::FeaturePrefer256Bit,
+
+ // CPU name enums. These just follow the CPU string.
+ X86::ProcIntelAtom,
+ X86::ProcIntelGLM,
+ X86::ProcIntelGLP,
+ X86::ProcIntelSLM,
+ X86::ProcIntelTRM,
+ };
+
public:
explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -129,14 +186,21 @@ public:
bool canMacroFuseCmp();
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment);
+ bool isLegalNTStore(Type *DataType, unsigned Alignment);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
+ bool isLegalMaskedExpandLoad(Type *DataType);
+ bool isLegalMaskedCompressStore(Type *DataType);
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
- const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
- bool IsZeroCmp) const;
+ bool areFunctionArgsABICompatible(const Function *Caller,
+ const Function *Callee,
+ SmallPtrSetImpl<Argument *> &Args) const;
+ TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+ bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index f882b760927c..a07d2f20acab 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -1,9 +1,8 @@
//===- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
index d298aaa97ecd..9e499db1d7ee 100644
--- a/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -1,9 +1,8 @@
//===----- X86WinAllocaExpander.cpp - Expand WinAlloca pseudo instruction -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -85,10 +84,6 @@ static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
unsigned AmountReg = MI->getOperand(0).getReg();
MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg);
- // Look through copies.
- while (Def && Def->isCopy() && Def->getOperand(1).isReg())
- Def = MRI->getUniqueVRegDef(Def->getOperand(1).getReg());
-
if (!Def ||
(Def->getOpcode() != X86::MOV32ri && Def->getOpcode() != X86::MOV64ri) ||
!Def->getOperand(1).isImm())
@@ -210,15 +205,18 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
return;
}
+ // These two variables differ on x32, which is a 64-bit target with a
+ // 32-bit alloca.
bool Is64Bit = STI->is64Bit();
+ bool Is64BitAlloca = MI->getOpcode() == X86::WIN_ALLOCA_64;
assert(SlotSize == 4 || SlotSize == 8);
- unsigned RegA = (SlotSize == 8) ? X86::RAX : X86::EAX;
switch (L) {
- case TouchAndSub:
+ case TouchAndSub: {
assert(Amount >= SlotSize);
// Use a push to touch the top of the stack.
+ unsigned RegA = Is64Bit ? X86::RAX : X86::EAX;
BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
.addReg(RegA, RegState::Undef);
Amount -= SlotSize;
@@ -227,15 +225,18 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
// Fall through to make any remaining adjustment.
LLVM_FALLTHROUGH;
+ }
case Sub:
assert(Amount > 0);
if (Amount == SlotSize) {
// Use push to save size.
+ unsigned RegA = Is64Bit ? X86::RAX : X86::EAX;
BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
.addReg(RegA, RegState::Undef);
} else {
// Sub.
- BuildMI(*MBB, I, DL, TII->get(getSubOpcode(Is64Bit, Amount)), StackPtr)
+ BuildMI(*MBB, I, DL,
+ TII->get(getSubOpcode(Is64BitAlloca, Amount)), StackPtr)
.addReg(StackPtr)
.addImm(Amount);
}
@@ -243,16 +244,17 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
case Probe:
if (!NoStackArgProbe) {
// The probe lowering expects the amount in RAX/EAX.
+ unsigned RegA = Is64BitAlloca ? X86::RAX : X86::EAX;
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA)
.addReg(MI->getOperand(0).getReg());
// Do the probe.
STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL,
- /*InPrologue=*/false);
+ /*InProlog=*/false);
} else {
// Sub
- BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::SUB64rr : X86::SUB32rr),
- StackPtr)
+ BuildMI(*MBB, I, DL,
+ TII->get(Is64BitAlloca ? X86::SUB64rr : X86::SUB32rr), StackPtr)
.addReg(StackPtr)
.addReg(MI->getOperand(0).getReg());
}
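The comment in the first hunk of lower() above notes that Is64Bit and Is64BitAlloca differ on x32 (a 64-bit target with a 32-bit alloca), and the hunks use them for different decisions: push opcodes follow the mode, while the stack-pointer subtraction follows the alloca width. A self-contained sketch of just that selection, with string mnemonics standing in for the real X86:: opcode enums (illustrative only):

    #include <string>

    // Illustrative only: which opcode each lowering step would pick.
    // Is64BitMode ~ STI->is64Bit(); Is64BitAlloca ~ opcode == WIN_ALLOCA_64.
    struct WinAllocaOpcodesSketch {
      std::string PushOpc; // touches the stack one slot at a time
      std::string SubOpc;  // subtracts the remaining amount from SP
    };

    WinAllocaOpcodesSketch pickWinAllocaOpcodesSketch(bool Is64BitMode,
                                                      bool Is64BitAlloca) {
      WinAllocaOpcodesSketch O;
      // Pushes follow the mode: in 64-bit mode (including x32) a push is
      // always 64 bits wide, so PUSH64r is used.
      O.PushOpc = Is64BitMode ? "PUSH64r" : "PUSH32r";
      // Stack-pointer arithmetic follows the alloca width: the WIN_ALLOCA_32
      // form keeps using the 32-bit subtract even on x32.
      O.SubOpc = Is64BitAlloca ? "SUB64rr" : "SUB32rr";
      return O;
    }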
@@ -262,18 +264,10 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
unsigned AmountReg = MI->getOperand(0).getReg();
MI->eraseFromParent();
- // Delete the definition of AmountReg, possibly walking a chain of copies.
- for (;;) {
- if (!MRI->use_empty(AmountReg))
- break;
- MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg);
- if (!AmountDef)
- break;
- if (AmountDef->isCopy() && AmountDef->getOperand(1).isReg())
- AmountReg = AmountDef->getOperand(1).isReg();
- AmountDef->eraseFromParent();
- break;
- }
+ // Delete the definition of AmountReg.
+ if (MRI->use_empty(AmountReg))
+ if (MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg))
+ AmountDef->eraseFromParent();
}
bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 185deda97c1f..f68d17d7256d 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -1,9 +1,8 @@
//===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -41,9 +40,7 @@ class WinEHStatePass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid.
- WinEHStatePass() : FunctionPass(ID) {
- initializeWinEHStatePassPass(*PassRegistry::getPassRegistry());
- }
+ WinEHStatePass() : FunctionPass(ID) { }
bool runOnFunction(Function &Fn) override;
@@ -87,15 +84,15 @@ private:
StructType *EHLinkRegistrationTy = nullptr;
StructType *CXXEHRegistrationTy = nullptr;
StructType *SEHRegistrationTy = nullptr;
- Constant *SetJmp3 = nullptr;
- Constant *CxxLongjmpUnwind = nullptr;
+ FunctionCallee SetJmp3 = nullptr;
+ FunctionCallee CxxLongjmpUnwind = nullptr;
// Per-function state
EHPersonality Personality = EHPersonality::Unknown;
Function *PersonalityFn = nullptr;
bool UseStackGuard = false;
int ParentBaseState;
- Constant *SehLongjmpUnwind = nullptr;
+ FunctionCallee SehLongjmpUnwind = nullptr;
Constant *Cookie = nullptr;
/// The stack allocation containing all EH data, including the link in the
@@ -304,7 +301,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
CxxLongjmpUnwind = TheModule->getOrInsertFunction(
"__CxxLongjmpUnwind",
FunctionType::get(VoidTy, Int8PtrType, /*isVarArg=*/false));
- cast<Function>(CxxLongjmpUnwind->stripPointerCasts())
+ cast<Function>(CxxLongjmpUnwind.getCallee()->stripPointerCasts())
->setCallingConv(CallingConv::X86_StdCall);
} else if (Personality == EHPersonality::MSVC_X86SEH) {
// If _except_handler4 is in use, some additional guard checks and prologue
@@ -357,7 +354,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
UseStackGuard ? "_seh_longjmp_unwind4" : "_seh_longjmp_unwind",
FunctionType::get(Type::getVoidTy(TheModule->getContext()), Int8PtrType,
/*isVarArg=*/false));
- cast<Function>(SehLongjmpUnwind->stripPointerCasts())
+ cast<Function>(SehLongjmpUnwind.getCallee()->stripPointerCasts())
->setCallingConv(CallingConv::X86_StdCall);
} else {
llvm_unreachable("unexpected personality function");
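The SetJmp3/CxxLongjmpUnwind/SehLongjmpUnwind members changing from Constant * to FunctionCallee track the LLVM 9 API change in which Module::getOrInsertFunction returns a FunctionCallee (a function type plus a callee value) rather than a bare Constant *; code that used to strip pointer casts off the constant now goes through getCallee() first, exactly as the hunks above do. A minimal sketch of that pattern against the LLVM C++ API of the same vintage; the runtime function name here is only an example:

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Declare (or reuse) a hypothetical void(i8*) runtime helper and give it
    // the stdcall convention, using the FunctionCallee-returning API.
    static FunctionCallee declareUnwindHelperSketch(Module &M) {
      LLVMContext &Ctx = M.getContext();
      FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
                                            Type::getInt8PtrTy(Ctx),
                                            /*isVarArg=*/false);
      FunctionCallee FC = M.getOrInsertFunction("__example_unwind_helper", FTy);
      // getCallee() may be a bitcast constant if a mismatched declaration
      // already exists, hence stripPointerCasts() before the cast.
      if (auto *F = dyn_cast<Function>(FC.getCallee()->stripPointerCasts()))
        F->setCallingConv(CallingConv::X86_StdCall);
      return FC;
    }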
@@ -412,7 +409,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo());
auto AI = Trampoline->arg_begin();
Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++};
- CallInst *Call = Builder.CreateCall(CastPersonality, Args);
+ CallInst *Call = Builder.CreateCall(TargetFuncTy, CastPersonality, Args);
// Can't use musttail due to prototype mismatch, but we can use tail.
Call->setTailCall(true);
// Set inreg so we pass it in EAX.
@@ -433,7 +430,7 @@ void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder,
// Next = [fs:00]
Constant *FSZero =
Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
- Value *Next = Builder.CreateLoad(FSZero);
+ Value *Next = Builder.CreateLoad(LinkTy->getPointerTo(), FSZero);
Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0));
// [fs:00] = Link
Builder.CreateStore(Link, FSZero);
@@ -448,8 +445,8 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
}
Type *LinkTy = getEHLinkRegistrationType();
// [fs:00] = Link->Next
- Value *Next =
- Builder.CreateLoad(Builder.CreateStructGEP(LinkTy, Link, 0));
+ Value *Next = Builder.CreateLoad(LinkTy->getPointerTo(),
+ Builder.CreateStructGEP(LinkTy, Link, 0));
Constant *FSZero =
Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
Builder.CreateStore(Next, FSZero);
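The link/unlink hunks keep the same protocol as before and only spell out the loaded pointee type: registering pushes a node onto the singly linked list whose head lives at fs:[0] (the per-thread SEH registration chain), and unregistering pops it. A plain C++ model of that list manipulation, under the simplifying assumption that fs:[0] can be treated as an ordinary head pointer (illustrative only; the real field layout comes from getEHLinkRegistrationType()):

    // Illustrative model of the registration chain rooted at fs:[0].
    struct EHLinkSketch {
      EHLinkSketch *Next;
      void *Handler;
    };

    // Stands in for the fs:[0] slot of the current thread.
    static EHLinkSketch *FsZeroSketch = nullptr;

    // linkExceptionRegistration: Link->Next = [fs:00]; [fs:00] = Link
    void linkSketch(EHLinkSketch *Link) {
      Link->Next = FsZeroSketch;
      FsZeroSketch = Link;
    }

    // unlinkExceptionRegistration: [fs:00] = Link->Next
    void unlinkSketch(EHLinkSketch *Link) {
      FsZeroSketch = Link->Next;
    }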
@@ -472,11 +469,11 @@ void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
SmallVector<Value *, 3> OptionalArgs;
if (Personality == EHPersonality::MSVC_CXX) {
- OptionalArgs.push_back(CxxLongjmpUnwind);
+ OptionalArgs.push_back(CxxLongjmpUnwind.getCallee());
OptionalArgs.push_back(State);
OptionalArgs.push_back(emitEHLSDA(Builder, &F));
} else if (Personality == EHPersonality::MSVC_X86SEH) {
- OptionalArgs.push_back(SehLongjmpUnwind);
+ OptionalArgs.push_back(SehLongjmpUnwind.getCallee());
OptionalArgs.push_back(State);
if (UseStackGuard)
OptionalArgs.push_back(Cookie);
@@ -767,7 +764,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
if (!CS)
continue;
if (CS.getCalledValue()->stripPointerCasts() !=
- SetJmp3->stripPointerCasts())
+ SetJmp3.getCallee()->stripPointerCasts())
continue;
SetJmp3CallSites.push_back(CS);
@@ -782,9 +779,9 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
IRBuilder<> Builder(CS.getInstruction());
Value *State;
if (InCleanup) {
- Value *StateField =
- Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex);
- State = Builder.CreateLoad(StateField);
+ Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(),
+ RegNode, StateFieldIndex);
+ State = Builder.CreateLoad(Builder.getInt32Ty(), StateField);
} else {
State = Builder.getInt32(getStateForCallSite(BlockColors, FuncInfo, CS));
}
@@ -794,7 +791,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
void WinEHStatePass::insertStateNumberStore(Instruction *IP, int State) {
IRBuilder<> Builder(IP);
- Value *StateField =
- Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex);
+ Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(),
+ RegNode, StateFieldIndex);
Builder.CreateStore(Builder.getInt32(State), StateField);
}
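The remaining IRBuilder changes in this file (explicit types for CreateLoad and CreateStructGEP, the extra FunctionType argument to CreateCall) all follow one theme: calls that used to infer the pointee type from the pointer operand now take it explicitly, in preparation for opaque pointers. A small sketch of the updated idiom, assuming an AllocaInst *RegNode and an i32 state field as in the pass above (LLVM C++ API; names reused only for illustration):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Load the i32 state field out of the registration-node alloca, passing
    // the struct type and the loaded type explicitly rather than nullptr.
    static Value *loadStateFieldSketch(IRBuilder<> &Builder,
                                       AllocaInst *RegNode,
                                       unsigned StateFieldIndex) {
      Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(),
                                                  RegNode, StateFieldIndex);
      return Builder.CreateLoad(Builder.getInt32Ty(), StateField);
    }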