diff options
Diffstat (limited to 'llvm/lib/Target/X86/X86InstrInfo.cpp')
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.cpp | 851 |
1 files changed, 533 insertions, 318 deletions
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 4dcd886fa3b2..ec32ac2acad1 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -25,13 +25,16 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -137,298 +140,70 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } bool X86InstrInfo::isDataInvariant(MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - // By default, assume that the instruction is not data invariant. + if (MI.mayLoad() || MI.mayStore()) return false; - // Some target-independent operations that trivially lower to data-invariant - // instructions. - case TargetOpcode::COPY: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: + // Some target-independent operations that trivially lower to data-invariant + // instructions. + if (MI.isCopyLike() || MI.isInsertSubreg()) return true; + unsigned Opcode = MI.getOpcode(); + using namespace X86; // On x86 it is believed that imul is constant time w.r.t. the loaded data. // However, they set flags and are perhaps the most surprisingly constant // time operations so we call them out here separately. - case X86::IMUL16rr: - case X86::IMUL16rri8: - case X86::IMUL16rri: - case X86::IMUL32rr: - case X86::IMUL32rri8: - case X86::IMUL32rri: - case X86::IMUL64rr: - case X86::IMUL64rri32: - case X86::IMUL64rri8: - + if (isIMUL(Opcode)) + return true; // Bit scanning and counting instructions that are somewhat surprisingly // constant time as they scan across bits and do other fairly complex // operations like popcnt, but are believed to be constant time on x86. // However, these set flags. - case X86::BSF16rr: - case X86::BSF32rr: - case X86::BSF64rr: - case X86::BSR16rr: - case X86::BSR32rr: - case X86::BSR64rr: - case X86::LZCNT16rr: - case X86::LZCNT32rr: - case X86::LZCNT64rr: - case X86::POPCNT16rr: - case X86::POPCNT32rr: - case X86::POPCNT64rr: - case X86::TZCNT16rr: - case X86::TZCNT32rr: - case X86::TZCNT64rr: - + if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) || + isTZCNT(Opcode)) + return true; // Bit manipulation instructions are effectively combinations of basic // arithmetic ops, and should still execute in constant time. These also // set flags. - case X86::BLCFILL32rr: - case X86::BLCFILL64rr: - case X86::BLCI32rr: - case X86::BLCI64rr: - case X86::BLCIC32rr: - case X86::BLCIC64rr: - case X86::BLCMSK32rr: - case X86::BLCMSK64rr: - case X86::BLCS32rr: - case X86::BLCS64rr: - case X86::BLSFILL32rr: - case X86::BLSFILL64rr: - case X86::BLSI32rr: - case X86::BLSI64rr: - case X86::BLSIC32rr: - case X86::BLSIC64rr: - case X86::BLSMSK32rr: - case X86::BLSMSK64rr: - case X86::BLSR32rr: - case X86::BLSR64rr: - case X86::TZMSK32rr: - case X86::TZMSK64rr: - + if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) || + isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) || + isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) || + isTZMSK(Opcode)) + return true; // Bit extracting and clearing instructions should execute in constant time, // and set flags. - case X86::BEXTR32rr: - case X86::BEXTR64rr: - case X86::BEXTRI32ri: - case X86::BEXTRI64ri: - case X86::BZHI32rr: - case X86::BZHI64rr: - + if (isBEXTR(Opcode) || isBZHI(Opcode)) + return true; // Shift and rotate. - case X86::ROL8r1: - case X86::ROL16r1: - case X86::ROL32r1: - case X86::ROL64r1: - case X86::ROL8rCL: - case X86::ROL16rCL: - case X86::ROL32rCL: - case X86::ROL64rCL: - case X86::ROL8ri: - case X86::ROL16ri: - case X86::ROL32ri: - case X86::ROL64ri: - case X86::ROR8r1: - case X86::ROR16r1: - case X86::ROR32r1: - case X86::ROR64r1: - case X86::ROR8rCL: - case X86::ROR16rCL: - case X86::ROR32rCL: - case X86::ROR64rCL: - case X86::ROR8ri: - case X86::ROR16ri: - case X86::ROR32ri: - case X86::ROR64ri: - case X86::SAR8r1: - case X86::SAR16r1: - case X86::SAR32r1: - case X86::SAR64r1: - case X86::SAR8rCL: - case X86::SAR16rCL: - case X86::SAR32rCL: - case X86::SAR64rCL: - case X86::SAR8ri: - case X86::SAR16ri: - case X86::SAR32ri: - case X86::SAR64ri: - case X86::SHL8r1: - case X86::SHL16r1: - case X86::SHL32r1: - case X86::SHL64r1: - case X86::SHL8rCL: - case X86::SHL16rCL: - case X86::SHL32rCL: - case X86::SHL64rCL: - case X86::SHL8ri: - case X86::SHL16ri: - case X86::SHL32ri: - case X86::SHL64ri: - case X86::SHR8r1: - case X86::SHR16r1: - case X86::SHR32r1: - case X86::SHR64r1: - case X86::SHR8rCL: - case X86::SHR16rCL: - case X86::SHR32rCL: - case X86::SHR64rCL: - case X86::SHR8ri: - case X86::SHR16ri: - case X86::SHR32ri: - case X86::SHR64ri: - case X86::SHLD16rrCL: - case X86::SHLD32rrCL: - case X86::SHLD64rrCL: - case X86::SHLD16rri8: - case X86::SHLD32rri8: - case X86::SHLD64rri8: - case X86::SHRD16rrCL: - case X86::SHRD32rrCL: - case X86::SHRD64rrCL: - case X86::SHRD16rri8: - case X86::SHRD32rri8: - case X86::SHRD64rri8: - + if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) || + isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode)) + return true; // Basic arithmetic is constant time on the input but does set flags. - case X86::ADC8rr: - case X86::ADC8ri: - case X86::ADC16rr: - case X86::ADC16ri: - case X86::ADC16ri8: - case X86::ADC32rr: - case X86::ADC32ri: - case X86::ADC32ri8: - case X86::ADC64rr: - case X86::ADC64ri8: - case X86::ADC64ri32: - case X86::ADD8rr: - case X86::ADD8ri: - case X86::ADD16rr: - case X86::ADD16ri: - case X86::ADD16ri8: - case X86::ADD32rr: - case X86::ADD32ri: - case X86::ADD32ri8: - case X86::ADD64rr: - case X86::ADD64ri8: - case X86::ADD64ri32: - case X86::AND8rr: - case X86::AND8ri: - case X86::AND16rr: - case X86::AND16ri: - case X86::AND16ri8: - case X86::AND32rr: - case X86::AND32ri: - case X86::AND32ri8: - case X86::AND64rr: - case X86::AND64ri8: - case X86::AND64ri32: - case X86::OR8rr: - case X86::OR8ri: - case X86::OR16rr: - case X86::OR16ri: - case X86::OR16ri8: - case X86::OR32rr: - case X86::OR32ri: - case X86::OR32ri8: - case X86::OR64rr: - case X86::OR64ri8: - case X86::OR64ri32: - case X86::SBB8rr: - case X86::SBB8ri: - case X86::SBB16rr: - case X86::SBB16ri: - case X86::SBB16ri8: - case X86::SBB32rr: - case X86::SBB32ri: - case X86::SBB32ri8: - case X86::SBB64rr: - case X86::SBB64ri8: - case X86::SBB64ri32: - case X86::SUB8rr: - case X86::SUB8ri: - case X86::SUB16rr: - case X86::SUB16ri: - case X86::SUB16ri8: - case X86::SUB32rr: - case X86::SUB32ri: - case X86::SUB32ri8: - case X86::SUB64rr: - case X86::SUB64ri8: - case X86::SUB64ri32: - case X86::XOR8rr: - case X86::XOR8ri: - case X86::XOR16rr: - case X86::XOR16ri: - case X86::XOR16ri8: - case X86::XOR32rr: - case X86::XOR32ri: - case X86::XOR32ri8: - case X86::XOR64rr: - case X86::XOR64ri8: - case X86::XOR64ri32: + if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) || + isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode)) + return true; // Arithmetic with just 32-bit and 64-bit variants and no immediates. - case X86::ADCX32rr: - case X86::ADCX64rr: - case X86::ADOX32rr: - case X86::ADOX64rr: - case X86::ANDN32rr: - case X86::ANDN64rr: + if (isADCX(Opcode) || isADOX(Opcode) || isANDN(Opcode)) + return true; // Unary arithmetic operations. - case X86::DEC8r: - case X86::DEC16r: - case X86::DEC32r: - case X86::DEC64r: - case X86::INC8r: - case X86::INC16r: - case X86::INC32r: - case X86::INC64r: - case X86::NEG8r: - case X86::NEG16r: - case X86::NEG32r: - case X86::NEG64r: - + if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode)) + return true; // Unlike other arithmetic, NOT doesn't set EFLAGS. - case X86::NOT8r: - case X86::NOT16r: - case X86::NOT32r: - case X86::NOT64r: - + if (isNOT(Opcode)) + return true; // Various move instructions used to zero or sign extend things. Note that we // intentionally don't support the _NOREX variants as we can't handle that // register constraint anyways. - case X86::MOVSX16rr8: - case X86::MOVSX32rr8: - case X86::MOVSX32rr16: - case X86::MOVSX64rr8: - case X86::MOVSX64rr16: - case X86::MOVSX64rr32: - case X86::MOVZX16rr8: - case X86::MOVZX32rr8: - case X86::MOVZX32rr16: - case X86::MOVZX64rr8: - case X86::MOVZX64rr16: - case X86::MOV32rr: - + if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode)) + return true; // Arithmetic instructions that are both constant time and don't set flags. - case X86::RORX32ri: - case X86::RORX64ri: - case X86::SARX32rr: - case X86::SARX64rr: - case X86::SHLX32rr: - case X86::SHLX64rr: - case X86::SHRX32rr: - case X86::SHRX64rr: - + if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode)) + return true; // LEA doesn't actually access memory, and its arithmetic is constant time. - case X86::LEA16r: - case X86::LEA32r: - case X86::LEA64_32r: - case X86::LEA64r: + if (isLEA(Opcode)) return true; - } + // By default, assume that the instruction is not data invariant. + return false; } bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) { @@ -990,6 +765,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::AVX_SET0: case X86::FsFLD0SD: case X86::FsFLD0SS: + case X86::FsFLD0SH: case X86::FsFLD0F128: case X86::KSET0D: case X86::KSET0Q: @@ -1192,6 +968,102 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { return ShAmt < 4 && ShAmt > 0; } +static bool findRedundantFlagInstr(MachineInstr &CmpInstr, + MachineInstr &CmpValDefInstr, + const MachineRegisterInfo *MRI, + MachineInstr **AndInstr, + const TargetRegisterInfo *TRI, + bool &NoSignFlag, bool &ClearsOverflowFlag) { + if (CmpValDefInstr.getOpcode() != X86::SUBREG_TO_REG) + return false; + + if (CmpInstr.getOpcode() != X86::TEST64rr) + return false; + + // CmpInstr is a TEST64rr instruction, and `X86InstrInfo::analyzeCompare` + // guarantees that it's analyzable only if two registers are identical. + assert( + (CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) && + "CmpInstr is an analyzable TEST64rr, and `X86InstrInfo::analyzeCompare` " + "requires two reg operands are the same."); + + // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that + // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case + // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is + // redundant. + assert( + (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) && + "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG."); + + // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is typically + // 0. + if (CmpValDefInstr.getOperand(1).getImm() != 0) + return false; + + // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically + // sub_32bit or sub_xmm. + if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit) + return false; + + MachineInstr *VregDefInstr = + MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg()); + + assert(VregDefInstr && "Must have a definition (SSA)"); + + // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB + // to simplify the subsequent analysis. + // + // FIXME: If `VregDefInstr->getParent()` is the only predecessor of + // `CmpValDefInstr.getParent()`, this could be handled. + if (VregDefInstr->getParent() != CmpValDefInstr.getParent()) + return false; + + if (X86::isAND(VregDefInstr->getOpcode())) { + // Get a sequence of instructions like + // %reg = and* ... // Set EFLAGS + // ... // EFLAGS not changed + // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit + // test64rr %extended_reg, %extended_reg, implicit-def $eflags + // + // If subsequent readers use a subset of bits that don't change + // after `and*` instructions, it's likely that the test64rr could + // be optimized away. + for (const MachineInstr &Instr : + make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)), + MachineBasicBlock::iterator(CmpValDefInstr))) { + // There are instructions between 'VregDefInstr' and + // 'CmpValDefInstr' that modifies EFLAGS. + if (Instr.modifiesRegister(X86::EFLAGS, TRI)) + return false; + } + + *AndInstr = VregDefInstr; + + // AND instruction will essentially update SF and clear OF, so + // NoSignFlag should be false in the sense that SF is modified by `AND`. + // + // However, the implementation artifically sets `NoSignFlag` to true + // to poison the SF bit; that is to say, if SF is looked at later, the + // optimization (to erase TEST64rr) will be disabled. + // + // The reason to poison SF bit is that SF bit value could be different + // in the `AND` and `TEST` operation; signed bit is not known for `AND`, + // and is known to be 0 as a result of `TEST64rr`. + // + // FIXME: As opposed to poisoning the SF bit directly, consider peeking into + // the AND instruction and using the static information to guide peephole + // optimization if possible. For example, it's possible to fold a + // conditional move into a copy if the relevant EFLAG bits could be deduced + // from an immediate operand of and operation. + // + NoSignFlag = true; + // ClearsOverflowFlag is true for AND operation (no surprise). + ClearsOverflowFlag = true; + return true; + } + return false; +} + bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, Register &NewSrc, bool &isKill, MachineOperand &ImplicitOp, @@ -1314,8 +1186,11 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, case X86::SHL8ri: case X86::SHL16ri: { unsigned ShAmt = MI.getOperand(2).getImm(); - MIB.addReg(0).addImm(1ULL << ShAmt) - .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0); + MIB.addReg(0) + .addImm(1LL << ShAmt) + .addReg(InRegLEA, RegState::Kill) + .addImm(0) + .addReg(0); break; } case X86::INC8r: @@ -1478,7 +1353,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) .add(Dest) .addReg(0) - .addImm(1ULL << ShAmt) + .addImm(1LL << ShAmt) .add(Src) .addImm(0) .addReg(0); @@ -1502,7 +1377,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) .addReg(0) - .addImm(1ULL << ShAmt) + .addImm(1LL << ShAmt) .addReg(SrcReg, getKillRegState(isKill)) .addImm(0) .addReg(0); @@ -1957,14 +1832,13 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( FMAForms[0] = FMA3Group.get132Opcode(); FMAForms[1] = FMA3Group.get213Opcode(); FMAForms[2] = FMA3Group.get231Opcode(); - unsigned FormIndex; - for (FormIndex = 0; FormIndex < 3; FormIndex++) - if (Opc == FMAForms[FormIndex]) - break; // Everything is ready, just adjust the FMA opcode and return it. - FormIndex = FormMapping[Case][FormIndex]; - return FMAForms[FormIndex]; + for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++) + if (Opc == FMAForms[FormIndex]) + return FMAForms[FormMapping[Case][FormIndex]]; + + llvm_unreachable("Illegal FMA3 format"); } static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, @@ -2141,7 +2015,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, if ((MI.getOperand(3).getImm() ^ Mask) == 1) { auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); - WorkingMI.RemoveOperand(3); + WorkingMI.removeOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); @@ -2238,7 +2112,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!"); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(X86::MOVSDrr)); - WorkingMI.RemoveOperand(3); + WorkingMI.removeOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } @@ -2813,34 +2687,37 @@ bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const { return false; } +int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) { + unsigned Opcode = MCID.getOpcode(); + if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode))) + return -1; + // Assume that condition code is always the last use operand. + unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs(); + return NumUses - 1; +} + +X86::CondCode X86::getCondFromMI(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + int CondNo = getCondSrcNoFromDesc(MCID); + if (CondNo < 0) + return X86::COND_INVALID; + CondNo += MCID.getNumDefs(); + return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm()); +} + X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return X86::COND_INVALID; - case X86::JCC_1: - return static_cast<X86::CondCode>( - MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); - } + return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI) + : X86::COND_INVALID; } -/// Return condition code of a SETCC opcode. X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return X86::COND_INVALID; - case X86::SETCCr: case X86::SETCCm: - return static_cast<X86::CondCode>( - MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); - } + return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI) + : X86::COND_INVALID; } -/// Return condition code of a CMov opcode. X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return X86::COND_INVALID; - case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: - case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm: - return static_cast<X86::CondCode>( - MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); - } + return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI) + : X86::COND_INVALID; } /// Return the inverse of the specified condition, @@ -3166,8 +3043,7 @@ bool X86InstrInfo::AnalyzeBranchImpl( } // If the block has any instructions after a JMP, delete them. - while (std::next(I) != MBB.end()) - std::next(I)->eraseFromParent(); + MBB.erase(std::next(I), MBB.end()); Cond.clear(); FBB = nullptr; @@ -3464,7 +3340,7 @@ bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Not all subtargets have cmov instructions. - if (!Subtarget.hasCMov()) + if (!Subtarget.canUseCMOV()) return false; if (Cond.size() != 1) return false; @@ -3708,10 +3584,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg, case 2: if (X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; - if (X86::FR16XRegClass.hasSubClassEq(RC)) { - assert(STI.hasFP16()); - return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr; - } assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); return load ? X86::MOV16rm : X86::MOV16mr; case 4: @@ -3739,6 +3611,10 @@ static unsigned getLoadStoreRegOpcode(Register Reg, X86::VK8PAIRRegClass.hasSubClassEq(RC) || X86::VK16PAIRRegClass.hasSubClassEq(RC)) return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE; + if ((X86::FR16RegClass.hasSubClassEq(RC) || + X86::FR16XRegClass.hasSubClassEq(RC)) && + STI.hasFP16()) + return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr; llvm_unreachable("Unknown 4-byte regclass"); case 8: if (X86::GR64RegClass.hasSubClassEq(RC)) @@ -3845,6 +3721,35 @@ X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, return AM; } +bool X86InstrInfo::verifyInstruction(const MachineInstr &MI, + StringRef &ErrInfo) const { + Optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr); + if (!AMOrNone) + return true; + + ExtAddrMode AM = *AMOrNone; + + if (AM.ScaledReg != X86::NoRegister) { + switch (AM.Scale) { + case 1: + case 2: + case 4: + case 8: + break; + default: + ErrInfo = "Scale factor in address must be 1, 2, 4 or 8"; + return false; + } + } + if (!isInt<32>(AM.Displacement)) { + ErrInfo = "Displacement in address must fit into 32-bit signed " + "integer"; + return false; + } + + return true; +} + bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const { @@ -3949,12 +3854,12 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && "Stack slot too small for store"); if (RC->getID() == X86::TILERegClassID) { unsigned Opc = X86::TILESTORED; // tilestored %tmm, (%sp, %idx) - MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); MachineInstr *NewMI = @@ -3963,6 +3868,14 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineOperand &MO = NewMI->getOperand(2); MO.setReg(VirtReg); MO.setIsKill(true); + } else if ((RC->getID() == X86::FR16RegClassID || + RC->getID() == X86::FR16XRegClassID) && + !Subtarget.hasFP16()) { + unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZmr + : Subtarget.hasAVX() ? X86::VMOVSSmr + : X86::MOVSSmr; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); } else { unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); bool isAligned = @@ -3991,6 +3904,14 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineOperand &MO = NewMI->getOperand(3); MO.setReg(VirtReg); MO.setIsKill(true); + } else if ((RC->getID() == X86::FR16RegClassID || + RC->getID() == X86::FR16XRegClassID) && + !Subtarget.hasFP16()) { + unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZrm + : Subtarget.hasAVX() ? X86::VMOVSSrm + : X86::MOVSSrm; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), + FrameIdx); } else { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -4375,7 +4296,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case X86::SUB8ri: NewOpcode = X86::CMP8ri; break; } CmpInstr.setDesc(get(NewOpcode)); - CmpInstr.RemoveOperand(0); + CmpInstr.removeOperand(0); // Mutating this instruction invalidates any debug data associated with it. CmpInstr.dropDebugNumber(); // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. @@ -4423,6 +4344,23 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, MI = &Inst; break; } + + // Look back for the following pattern, in which case the test64rr + // instruction could be erased. + // + // Example: + // %reg = and32ri %in_reg, 5 + // ... // EFLAGS not changed. + // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index + // test64rr %src_reg, %src_reg, implicit-def $eflags + MachineInstr *AndInstr = nullptr; + if (IsCmpZero && + findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI, + NoSignFlag, ClearsOverflowFlag)) { + assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode())); + MI = AndInstr; + break; + } // Cannot find other candidates before definition of SrcReg. return false; } @@ -4524,6 +4462,11 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: + // If SF is used, but the instruction doesn't update the SF, then we + // can't do the optimization. + if (NoSignFlag) + return false; + LLVM_FALLTHROUGH; case X86::COND_O: case X86::COND_NO: // If OF is used, the instruction needs to clear it like CmpZero does. if (!ClearsOverflowFlag) @@ -4811,7 +4754,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm); MIB->setDesc(TII.get(X86::POP32r)); } - MIB->RemoveOperand(1); + MIB->removeOperand(1); MIB->addImplicitDefUseOperands(*MBB.getParent()); // Build CFI if necessary. @@ -4918,7 +4861,7 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { MIB->setDesc(Desc); int64_t ShiftAmt = MIB->getOperand(2).getImm(); // Temporarily remove the immediate so we can add another source register. - MIB->RemoveOperand(2); + MIB->removeOperand(2); // Add the register. Don't copy the kill flag if there is one. MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef())); @@ -4949,6 +4892,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: + case X86::FsFLD0SH: case X86::FsFLD0F128: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: { @@ -5026,7 +4970,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned MaskState = getRegState(MIB->getOperand(1)); unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; - MI.RemoveOperand(1); + MI.removeOperand(1); MIB->setDesc(get(Opc)); // VPTERNLOG needs 3 register inputs and an immediate. // 0xff will return 1s for any input. @@ -5165,6 +5109,255 @@ static bool hasPartialRegUpdate(unsigned Opcode, case X86::SQRTSDr_Int: case X86::SQRTSDm_Int: return true; + case X86::VFCMULCPHZ128rm: + case X86::VFCMULCPHZ128rmb: + case X86::VFCMULCPHZ128rmbkz: + case X86::VFCMULCPHZ128rmkz: + case X86::VFCMULCPHZ128rr: + case X86::VFCMULCPHZ128rrkz: + case X86::VFCMULCPHZ256rm: + case X86::VFCMULCPHZ256rmb: + case X86::VFCMULCPHZ256rmbkz: + case X86::VFCMULCPHZ256rmkz: + case X86::VFCMULCPHZ256rr: + case X86::VFCMULCPHZ256rrkz: + case X86::VFCMULCPHZrm: + case X86::VFCMULCPHZrmb: + case X86::VFCMULCPHZrmbkz: + case X86::VFCMULCPHZrmkz: + case X86::VFCMULCPHZrr: + case X86::VFCMULCPHZrrb: + case X86::VFCMULCPHZrrbkz: + case X86::VFCMULCPHZrrkz: + case X86::VFMULCPHZ128rm: + case X86::VFMULCPHZ128rmb: + case X86::VFMULCPHZ128rmbkz: + case X86::VFMULCPHZ128rmkz: + case X86::VFMULCPHZ128rr: + case X86::VFMULCPHZ128rrkz: + case X86::VFMULCPHZ256rm: + case X86::VFMULCPHZ256rmb: + case X86::VFMULCPHZ256rmbkz: + case X86::VFMULCPHZ256rmkz: + case X86::VFMULCPHZ256rr: + case X86::VFMULCPHZ256rrkz: + case X86::VFMULCPHZrm: + case X86::VFMULCPHZrmb: + case X86::VFMULCPHZrmbkz: + case X86::VFMULCPHZrmkz: + case X86::VFMULCPHZrr: + case X86::VFMULCPHZrrb: + case X86::VFMULCPHZrrbkz: + case X86::VFMULCPHZrrkz: + case X86::VFCMULCSHZrm: + case X86::VFCMULCSHZrmkz: + case X86::VFCMULCSHZrr: + case X86::VFCMULCSHZrrb: + case X86::VFCMULCSHZrrbkz: + case X86::VFCMULCSHZrrkz: + case X86::VFMULCSHZrm: + case X86::VFMULCSHZrmkz: + case X86::VFMULCSHZrr: + case X86::VFMULCSHZrrb: + case X86::VFMULCSHZrrbkz: + case X86::VFMULCSHZrrkz: + return Subtarget.hasMULCFalseDeps(); + case X86::VPERMDYrm: + case X86::VPERMDYrr: + case X86::VPERMQYmi: + case X86::VPERMQYri: + case X86::VPERMPSYrm: + case X86::VPERMPSYrr: + case X86::VPERMPDYmi: + case X86::VPERMPDYri: + case X86::VPERMDZ256rm: + case X86::VPERMDZ256rmb: + case X86::VPERMDZ256rmbkz: + case X86::VPERMDZ256rmkz: + case X86::VPERMDZ256rr: + case X86::VPERMDZ256rrkz: + case X86::VPERMDZrm: + case X86::VPERMDZrmb: + case X86::VPERMDZrmbkz: + case X86::VPERMDZrmkz: + case X86::VPERMDZrr: + case X86::VPERMDZrrkz: + case X86::VPERMQZ256mbi: + case X86::VPERMQZ256mbikz: + case X86::VPERMQZ256mi: + case X86::VPERMQZ256mikz: + case X86::VPERMQZ256ri: + case X86::VPERMQZ256rikz: + case X86::VPERMQZ256rm: + case X86::VPERMQZ256rmb: + case X86::VPERMQZ256rmbkz: + case X86::VPERMQZ256rmkz: + case X86::VPERMQZ256rr: + case X86::VPERMQZ256rrkz: + case X86::VPERMQZmbi: + case X86::VPERMQZmbikz: + case X86::VPERMQZmi: + case X86::VPERMQZmikz: + case X86::VPERMQZri: + case X86::VPERMQZrikz: + case X86::VPERMQZrm: + case X86::VPERMQZrmb: + case X86::VPERMQZrmbkz: + case X86::VPERMQZrmkz: + case X86::VPERMQZrr: + case X86::VPERMQZrrkz: + case X86::VPERMPSZ256rm: + case X86::VPERMPSZ256rmb: + case X86::VPERMPSZ256rmbkz: + case X86::VPERMPSZ256rmkz: + case X86::VPERMPSZ256rr: + case X86::VPERMPSZ256rrkz: + case X86::VPERMPSZrm: + case X86::VPERMPSZrmb: + case X86::VPERMPSZrmbkz: + case X86::VPERMPSZrmkz: + case X86::VPERMPSZrr: + case X86::VPERMPSZrrkz: + case X86::VPERMPDZ256mbi: + case X86::VPERMPDZ256mbikz: + case X86::VPERMPDZ256mi: + case X86::VPERMPDZ256mikz: + case X86::VPERMPDZ256ri: + case X86::VPERMPDZ256rikz: + case X86::VPERMPDZ256rm: + case X86::VPERMPDZ256rmb: + case X86::VPERMPDZ256rmbkz: + case X86::VPERMPDZ256rmkz: + case X86::VPERMPDZ256rr: + case X86::VPERMPDZ256rrkz: + case X86::VPERMPDZmbi: + case X86::VPERMPDZmbikz: + case X86::VPERMPDZmi: + case X86::VPERMPDZmikz: + case X86::VPERMPDZri: + case X86::VPERMPDZrikz: + case X86::VPERMPDZrm: + case X86::VPERMPDZrmb: + case X86::VPERMPDZrmbkz: + case X86::VPERMPDZrmkz: + case X86::VPERMPDZrr: + case X86::VPERMPDZrrkz: + return Subtarget.hasPERMFalseDeps(); + case X86::VRANGEPDZ128rmbi: + case X86::VRANGEPDZ128rmbikz: + case X86::VRANGEPDZ128rmi: + case X86::VRANGEPDZ128rmikz: + case X86::VRANGEPDZ128rri: + case X86::VRANGEPDZ128rrikz: + case X86::VRANGEPDZ256rmbi: + case X86::VRANGEPDZ256rmbikz: + case X86::VRANGEPDZ256rmi: + case X86::VRANGEPDZ256rmikz: + case X86::VRANGEPDZ256rri: + case X86::VRANGEPDZ256rrikz: + case X86::VRANGEPDZrmbi: + case X86::VRANGEPDZrmbikz: + case X86::VRANGEPDZrmi: + case X86::VRANGEPDZrmikz: + case X86::VRANGEPDZrri: + case X86::VRANGEPDZrrib: + case X86::VRANGEPDZrribkz: + case X86::VRANGEPDZrrikz: + case X86::VRANGEPSZ128rmbi: + case X86::VRANGEPSZ128rmbikz: + case X86::VRANGEPSZ128rmi: + case X86::VRANGEPSZ128rmikz: + case X86::VRANGEPSZ128rri: + case X86::VRANGEPSZ128rrikz: + case X86::VRANGEPSZ256rmbi: + case X86::VRANGEPSZ256rmbikz: + case X86::VRANGEPSZ256rmi: + case X86::VRANGEPSZ256rmikz: + case X86::VRANGEPSZ256rri: + case X86::VRANGEPSZ256rrikz: + case X86::VRANGEPSZrmbi: + case X86::VRANGEPSZrmbikz: + case X86::VRANGEPSZrmi: + case X86::VRANGEPSZrmikz: + case X86::VRANGEPSZrri: + case X86::VRANGEPSZrrib: + case X86::VRANGEPSZrribkz: + case X86::VRANGEPSZrrikz: + case X86::VRANGESDZrmi: + case X86::VRANGESDZrmikz: + case X86::VRANGESDZrri: + case X86::VRANGESDZrrib: + case X86::VRANGESDZrribkz: + case X86::VRANGESDZrrikz: + case X86::VRANGESSZrmi: + case X86::VRANGESSZrmikz: + case X86::VRANGESSZrri: + case X86::VRANGESSZrrib: + case X86::VRANGESSZrribkz: + case X86::VRANGESSZrrikz: + return Subtarget.hasRANGEFalseDeps(); + case X86::VGETMANTSSZrmi: + case X86::VGETMANTSSZrmikz: + case X86::VGETMANTSSZrri: + case X86::VGETMANTSSZrrib: + case X86::VGETMANTSSZrribkz: + case X86::VGETMANTSSZrrikz: + case X86::VGETMANTSDZrmi: + case X86::VGETMANTSDZrmikz: + case X86::VGETMANTSDZrri: + case X86::VGETMANTSDZrrib: + case X86::VGETMANTSDZrribkz: + case X86::VGETMANTSDZrrikz: + case X86::VGETMANTSHZrmi: + case X86::VGETMANTSHZrmikz: + case X86::VGETMANTSHZrri: + case X86::VGETMANTSHZrrib: + case X86::VGETMANTSHZrribkz: + case X86::VGETMANTSHZrrikz: + case X86::VGETMANTPSZ128rmbi: + case X86::VGETMANTPSZ128rmbikz: + case X86::VGETMANTPSZ128rmi: + case X86::VGETMANTPSZ128rmikz: + case X86::VGETMANTPSZ256rmbi: + case X86::VGETMANTPSZ256rmbikz: + case X86::VGETMANTPSZ256rmi: + case X86::VGETMANTPSZ256rmikz: + case X86::VGETMANTPSZrmbi: + case X86::VGETMANTPSZrmbikz: + case X86::VGETMANTPSZrmi: + case X86::VGETMANTPSZrmikz: + case X86::VGETMANTPDZ128rmbi: + case X86::VGETMANTPDZ128rmbikz: + case X86::VGETMANTPDZ128rmi: + case X86::VGETMANTPDZ128rmikz: + case X86::VGETMANTPDZ256rmbi: + case X86::VGETMANTPDZ256rmbikz: + case X86::VGETMANTPDZ256rmi: + case X86::VGETMANTPDZ256rmikz: + case X86::VGETMANTPDZrmbi: + case X86::VGETMANTPDZrmbikz: + case X86::VGETMANTPDZrmi: + case X86::VGETMANTPDZrmikz: + return Subtarget.hasGETMANTFalseDeps(); + case X86::VPMULLQZ128rm: + case X86::VPMULLQZ128rmb: + case X86::VPMULLQZ128rmbkz: + case X86::VPMULLQZ128rmkz: + case X86::VPMULLQZ128rr: + case X86::VPMULLQZ128rrkz: + case X86::VPMULLQZ256rm: + case X86::VPMULLQZ256rmb: + case X86::VPMULLQZ256rmbkz: + case X86::VPMULLQZ256rmkz: + case X86::VPMULLQZ256rr: + case X86::VPMULLQZ256rrkz: + case X86::VPMULLQZrm: + case X86::VPMULLQZrmb: + case X86::VPMULLQZrmbkz: + case X86::VPMULLQZrmkz: + case X86::VPMULLQZrr: + case X86::VPMULLQZrrkz: + return Subtarget.hasMULLQFalseDeps(); // GPR case X86::POPCNT32rm: case X86::POPCNT32rr: @@ -5591,6 +5784,28 @@ void X86InstrInfo::breakPartialRegDependency( .addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); MI.addRegisterKilled(Reg, TRI, true); + } else if (X86::VR128XRegClass.contains(Reg)) { + // Only handle VLX targets. + if (!Subtarget.hasVLX()) + return; + // Since vxorps requires AVX512DQ, vpxord should be the best choice. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + MI.addRegisterKilled(Reg, TRI, true); + } else if (X86::VR256XRegClass.contains(Reg) || + X86::VR512RegClass.contains(Reg)) { + // Only handle VLX targets. + if (!Subtarget.hasVLX()) + return; + // Use vpxord to clear the full ymm/zmm register. + // It wants to read and write the xmm sub-register. + Register XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg) + .addReg(XReg, RegState::Undef) + .addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + MI.addRegisterKilled(Reg, TRI, true); } else if (X86::GR64RegClass.contains(Reg)) { // Using XOR32rr because it has shorter encoding and zeros up the upper bits // as well. @@ -6413,6 +6628,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_FsFLD0SS: Alignment = Align(4); break; + case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: Alignment = Align(2); break; @@ -6451,6 +6667,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: + case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: @@ -6490,7 +6707,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Ty = Type::getDoubleTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) Ty = Type::getFP128Ty(MF.getFunction().getContext()); - else if (Opc == X86::AVX512_FsFLD0SH) + else if (Opc == X86::FsFLD0SH || Opc == X86::AVX512_FsFLD0SH) Ty = Type::getHalfTy(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), @@ -7170,7 +7387,7 @@ bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI, // ENDBR instructions should not be scheduled around. unsigned Opcode = MI.getOpcode(); if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 || - Opcode == X86::LDTILECFG) + Opcode == X86::PLDTILECFGV) return true; return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); @@ -9298,12 +9515,10 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo( // We check to see if CFI Instructions are present, and if they are // we find the number of CFI Instructions in the candidates. unsigned CFICount = 0; - MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); - for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); - Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { - if (MBBI->isCFIInstruction()) + for (auto &I : make_range(RepeatedSequenceLocs[0].front(), + std::next(RepeatedSequenceLocs[0].back()))) { + if (I.isCFIInstruction()) CFICount++; - MBBI++; } // We compare the number of found CFI Instructions to the number of CFI @@ -9440,7 +9655,7 @@ MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, - const outliner::Candidate &C) const { + outliner::Candidate &C) const { // Is it a tail call? if (C.CallConstructionID == MachineOutlinerTailCall) { // Yes, just insert a JMP. |
