From 044eb2f6afba375a914ac9d8024f8f5142bb912e Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Mon, 18 Dec 2017 20:10:56 +0000 Subject: Vendor import of llvm trunk r321017: https://llvm.org/svn/llvm-project/llvm/trunk@321017 --- lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 29 +- lib/Target/PowerPC/CMakeLists.txt | 3 + lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 86 +- lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h | 19 +- lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 6 +- .../PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 12 +- lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 1 + .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 14 +- lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 15 +- .../PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp | 8 +- lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h | 16 + lib/Target/PowerPC/P9InstrResources.td | 687 +++-- lib/Target/PowerPC/PPC.h | 10 + lib/Target/PowerPC/PPCAsmPrinter.cpp | 170 +- lib/Target/PowerPC/PPCBranchCoalescing.cpp | 784 ++++++ lib/Target/PowerPC/PPCBranchSelector.cpp | 2 +- lib/Target/PowerPC/PPCCTRLoops.cpp | 56 +- lib/Target/PowerPC/PPCEarlyReturn.cpp | 2 +- lib/Target/PowerPC/PPCExpandISEL.cpp | 87 +- lib/Target/PowerPC/PPCFastISel.cpp | 10 +- lib/Target/PowerPC/PPCFrameLowering.cpp | 18 +- lib/Target/PowerPC/PPCFrameLowering.h | 4 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 2755 +++++++++++++------- lib/Target/PowerPC/PPCISelLowering.cpp | 537 +++- lib/Target/PowerPC/PPCISelLowering.h | 73 +- lib/Target/PowerPC/PPCInstr64Bit.td | 27 +- lib/Target/PowerPC/PPCInstrAltivec.td | 38 +- lib/Target/PowerPC/PPCInstrFormats.td | 107 + lib/Target/PowerPC/PPCInstrInfo.cpp | 1417 +++++++++- lib/Target/PowerPC/PPCInstrInfo.h | 74 +- lib/Target/PowerPC/PPCInstrInfo.td | 219 +- lib/Target/PowerPC/PPCInstrVSX.td | 362 ++- lib/Target/PowerPC/PPCLoopPreIncPrep.cpp | 65 + lib/Target/PowerPC/PPCMCInstLower.cpp | 85 +- lib/Target/PowerPC/PPCMIPeephole.cpp | 966 ++++++- lib/Target/PowerPC/PPCMachineBasicBlockUtils.h | 198 ++ lib/Target/PowerPC/PPCMachineFunctionInfo.cpp | 14 + lib/Target/PowerPC/PPCMachineFunctionInfo.h | 18 + lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 95 + lib/Target/PowerPC/PPCQPXLoadSplat.cpp | 8 +- lib/Target/PowerPC/PPCReduceCRLogicals.cpp | 535 ++++ lib/Target/PowerPC/PPCRegisterInfo.cpp | 47 +- lib/Target/PowerPC/PPCRegisterInfo.td | 5 + lib/Target/PowerPC/PPCScheduleP9.td | 108 +- lib/Target/PowerPC/PPCSubtarget.h | 2 +- lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 2 +- lib/Target/PowerPC/PPCTargetMachine.cpp | 37 +- lib/Target/PowerPC/PPCTargetMachine.h | 9 +- lib/Target/PowerPC/PPCTargetObjectFile.h | 2 +- lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 29 +- lib/Target/PowerPC/PPCTargetTransformInfo.h | 7 +- lib/Target/PowerPC/PPCVSXFMAMutate.cpp | 44 +- lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 6 +- lib/Target/PowerPC/README.txt | 2 +- lib/Target/PowerPC/README_ALTIVEC.txt | 2 +- .../PowerPC/TargetInfo/PowerPCTargetInfo.cpp | 6 +- lib/Target/PowerPC/p9-instrs.txt | 442 ---- 57 files changed, 8116 insertions(+), 2266 deletions(-) create mode 100644 lib/Target/PowerPC/PPCBranchCoalescing.cpp create mode 100644 lib/Target/PowerPC/PPCMachineBasicBlockUtils.h create mode 100644 lib/Target/PowerPC/PPCPreEmitPeephole.cpp create mode 100644 lib/Target/PowerPC/PPCReduceCRLogicals.cpp delete mode 100644 lib/Target/PowerPC/p9-instrs.txt diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 52432a5820fbe..d6db354e02152 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -251,7 +251,6 @@ namespace { struct PPCOperand; class PPCAsmParser : public MCTargetAsmParser { - const MCInstrInfo &MII; bool IsPPC64; bool IsDarwin; @@ -298,7 +297,7 @@ class PPCAsmParser : public MCTargetAsmParser { public: PPCAsmParser(const MCSubtargetInfo &STI, MCAsmParser &, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI), MII(MII) { + : MCTargetAsmParser(Options, STI, MII) { // Check for 64-bit vs. 32-bit pointer mode. const Triple &TheTriple = STI.getTargetTriple(); IsPPC64 = (TheTriple.getArch() == Triple::ppc64 || @@ -394,6 +393,10 @@ public: /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const override { return EndLoc; } + /// getLocRange - Get the range between the first and last token of this + /// operand. + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + /// isPPC64 - True if this operand is for an instruction in 64-bit mode. bool isPPC64() const { return IsPPC64; } @@ -1138,6 +1141,15 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, Inst = TmpInst; break; } + case PPC::SUBPCIS: { + MCInst TmpInst; + int64_t N = Inst.getOperand(1).getImm(); + TmpInst.setOpcode(PPC::ADDPCIS); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(MCOperand::createImm(-N)); + Inst = TmpInst; + break; + } case PPC::SRDI: case PPC::SRDIo: { MCInst TmpInst; @@ -1260,6 +1272,9 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, } } +static std::string PPCMnemonicSpellCheck(StringRef S, uint64_t FBS, + unsigned VariantID = 0); + bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -1275,8 +1290,13 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return false; case Match_MissingFeature: return Error(IDLoc, "instruction use requires an option to be enabled"); - case Match_MnemonicFail: - return Error(IDLoc, "unrecognized instruction mnemonic"); + case Match_MnemonicFail: { + uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + std::string Suggestion = PPCMnemonicSpellCheck( + ((PPCOperand &)*Operands[0]).getToken(), FBS); + return Error(IDLoc, "invalid instruction" + Suggestion, + ((PPCOperand &)*Operands[0]).getLocRange()); + } case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; if (ErrorInfo != ~0ULL) { @@ -1912,6 +1932,7 @@ extern "C" void LLVMInitializePowerPCAsmParser() { #define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION +#define GET_MNEMONIC_SPELL_CHECKER #include "PPCGenAsmMatcher.inc" // Define this matcher function after the auto-generated include so we diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt index 7ca4c1999003a..3f173787114d4 100644 --- a/lib/Target/PowerPC/CMakeLists.txt +++ b/lib/Target/PowerPC/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_target(PowerPCCodeGen PPCBoolRetToInt.cpp PPCAsmPrinter.cpp PPCBranchSelector.cpp + PPCBranchCoalescing.cpp PPCCCState.cpp PPCCTRLoops.cpp PPCHazardRecognizers.cpp @@ -38,9 +39,11 @@ add_llvm_target(PowerPCCodeGen PPCTOCRegDeps.cpp PPCTLSDynamicCall.cpp PPCVSXCopy.cpp + PPCReduceCRLogicals.cpp PPCVSXFMAMutate.cpp PPCVSXSwapRemoval.cpp PPCExpandISEL.cpp + PPCPreEmitPeephole.cpp ) add_subdirectory(AsmParser) diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index 
baf5902ddf584..ea709a73ebf26 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCPredicates.h" #include "PPCInstrInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -23,7 +24,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetOpcodes.h" using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -39,6 +39,12 @@ static cl::opt<bool> ShowVSRNumsAsVR("ppc-vsr-nums-as-vr", cl::Hidden, cl::init(false), cl::desc("Prints full register names with vs{31-63} as v{0-31}")); +// Prints full register names with percent symbol. +static cl::opt<bool> +FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden, + cl::init(false), + cl::desc("Prints full register names with percent")); + #define PRINT_ALIAS_INSTR #include "PPCGenAsmWriter.inc" @@ -84,7 +90,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } } - + if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) && MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { O << "\tmr "; @@ -94,7 +100,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, printAnnotation(O, Annot); return; } - + if (MI->getOpcode() == PPC::RLDICR || MI->getOpcode() == PPC::RLDICR_32) { unsigned char SH = MI->getOperand(2).getImm(); @@ -161,7 +167,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } } - + if (!printAliasInstr(MI, O)) printInstruction(MI, O); printAnnotation(O, Annot); @@ -259,7 +265,7 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, } llvm_unreachable("Invalid predicate code"); } - + assert(StringRef(Modifier) == "reg" && "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!"); printOperand(MI, OpNo+1, O); @@ -445,13 +451,57 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo, O << '@' << MCSymbolRefExpr::getVariantKindName(refExp.getKind()); } +/// showRegistersWithPercentPrefix - Check if this register name should be +/// printed with a percentage symbol as prefix. +bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const { + if (!FullRegNamesWithPercent || TT.isOSDarwin() || TT.getOS() == Triple::AIX) + return false; + + switch (RegName[0]) { + default: + return false; + case 'r': + case 'f': + case 'q': + case 'v': + case 'c': + return true; + } +} + +/// getVerboseConditionRegName - This method expands the condition register +/// when requested explicitly or targeting Darwin. +const char *PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum, + unsigned RegEncoding) + const { + if (!TT.isOSDarwin() && !FullRegNames) + return nullptr; + if (RegNum < PPC::CR0EQ || RegNum > PPC::CR7UN) + return nullptr; + const char *CRBits[] = { + "lt", "gt", "eq", "un", + "4*cr1+lt", "4*cr1+gt", "4*cr1+eq", "4*cr1+un", + "4*cr2+lt", "4*cr2+gt", "4*cr2+eq", "4*cr2+un", + "4*cr3+lt", "4*cr3+gt", "4*cr3+eq", "4*cr3+un", + "4*cr4+lt", "4*cr4+gt", "4*cr4+eq", "4*cr4+un", + "4*cr5+lt", "4*cr5+gt", "4*cr5+eq", "4*cr5+un", + "4*cr6+lt", "4*cr6+gt", "4*cr6+eq", "4*cr6+un", + "4*cr7+lt", "4*cr7+gt", "4*cr7+eq", "4*cr7+un" + }; + return CRBits[RegEncoding]; +} + +// showRegistersWithPrefix - This method determines whether registers +// should be number-only or include the prefix.
+bool PPCInstPrinter::showRegistersWithPrefix() const { + if (TT.getOS() == Triple::AIX) + return false; + return TT.isOSDarwin() || FullRegNamesWithPercent || FullRegNames; +} /// stripRegisterPrefix - This method strips the character prefix from a -/// register name so that only the number is left. Used by for linux asm. +/// register name so that only the number is left. static const char *stripRegisterPrefix(const char *RegName) { - if (FullRegNames || ShowVSRNumsAsVR) - return RegName; - switch (RegName[0]) { case 'r': case 'f': @@ -462,7 +512,7 @@ static const char *stripRegisterPrefix(const char *RegName) { return RegName + 1; case 'c': if (RegName[1] == 'r') return RegName + 2; } - + return RegName; } @@ -487,20 +537,24 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, Reg = PPC::VSX32 + (Reg - PPC::VF0); } - const char *RegName = getRegisterName(Reg); - // The linux and AIX assembler does not take register prefixes. - if (!isDarwinSyntax()) + const char *RegName; + RegName = getVerboseConditionRegName(Reg, MRI.getEncodingValue(Reg)); + if (RegName == nullptr) + RegName = getRegisterName(Reg); + if (showRegistersWithPercentPrefix(RegName)) + O << "%"; + if (!showRegistersWithPrefix()) RegName = stripRegisterPrefix(RegName); - + O << RegName; return; } - + if (Op.isImm()) { O << Op.getImm(); return; } - + assert(Op.isExpr() && "unknown operand kind in printOperand"); Op.getExpr()->print(O, &MAI); } diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index 9c79ffb1176c0..f000fbb98110d 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -14,21 +14,24 @@ #ifndef LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H #define LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H +#include "llvm/ADT/Triple.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { class PPCInstPrinter : public MCInstPrinter { - bool IsDarwin; + Triple TT; +private: + bool showRegistersWithPercentPrefix(const char *RegName) const; + bool showRegistersWithPrefix() const; + const char *getVerboseConditionRegName(unsigned RegNum, + unsigned RegEncoding) const; + public: PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, bool isDarwin) - : MCInstPrinter(MAI, MII, MRI), IsDarwin(isDarwin) {} - - bool isDarwinSyntax() const { - return IsDarwin; - } - + const MCRegisterInfo &MRI, Triple T) - : MCInstPrinter(MAI, MII, MRI), TT(T) {} + void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index bdad2fe8714fd..2a1de244da923 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -204,7 +204,8 @@ namespace { public: DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { } - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { bool is64 = getPointerSize() == 8; return createPPCMachObjectWriter( OS, @@ -220,7 +221,8 @@ namespace { ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI) : PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { } - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override { bool is64 = getPointerSize() == 8; return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 1488bd5b0be61..44ee9733b16e1 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" @@ -416,10 +417,9 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, } } -MCObjectWriter *llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS, - bool Is64Bit, - bool IsLittleEndian, - uint8_t OSABI) { - MCELFObjectTargetWriter *MOTW = new PPCELFObjectWriter(Is64Bit, OSABI); - return createELFObjectWriter(MOTW, OS, IsLittleEndian); +std::unique_ptr<MCObjectWriter> +llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, + bool IsLittleEndian, uint8_t OSABI) { + auto MOTW = llvm::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI); + return createELFObjectWriter(std::move(MOTW), OS, IsLittleEndian); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index d30bf1a56e8aa..8ac461b96b88c 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -24,6 +24,7 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) { } IsLittleEndian = false; + SeparatorString = "@"; CommentString = ";"; ExceptionsType = ExceptionHandling::DwarfCFI; diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index e8f220ea54576..a1e4e07b25af4 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -94,15 +94,6 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, - CodeModel::Model &CM) { - if (CM == CodeModel::Default) { - if (!TT.isOSDarwin() && - (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)) - CM = CodeModel::Medium; - } -} - namespace { class PPCTargetAsmStreamer : public PPCTargetStreamer { @@ -248,7 +239,7 @@ static MCInstPrinter *createPPCMCInstPrinter(const Triple &T, const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) { - return new PPCInstPrinter(MAI, MII, MRI, T.isOSDarwin()); + return new PPCInstPrinter(MAI, MII, MRI, T); } extern "C" void LLVMInitializePowerPCTargetMC() { @@ -257,9 +248,6 @@ extern "C" void LLVMInitializePowerPCTargetMC() { // Register the MC asm info. RegisterMCAsmInfoFn C(*T, createPPCMCAsmInfo); - // Register the MC codegen info. - TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts); - // Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createPPCMCInstrInfo); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 893233ee2300f..80a74c09a598a 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -19,6 +19,7 @@ #include "llvm/Support/MathExtras.h" #include <cstdint> +#include <memory> namespace llvm { @@ -47,12 +48,15 @@ MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI, const MCTargetOptions &Options); /// Construct an PPC ELF object writer. -MCObjectWriter *createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, - bool IsLittleEndian, uint8_t OSABI); +std::unique_ptr<MCObjectWriter> createPPCELFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, + bool IsLittleEndian, + uint8_t OSABI); /// Construct a PPC Mach-O object writer. -MCObjectWriter *createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, - uint32_t CPUType, - uint32_t CPUSubtype); +std::unique_ptr<MCObjectWriter> createPPCMachObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); /// Returns true iff Val consists of one contiguous run of 1s with any number of /// 0s on either side. The 1s are allowed to wrap from LSB to MSB, so @@ -97,6 +101,7 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) { // Defines symbolic names for the PowerPC instructions. // #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_SCHED_ENUM #include "PPCGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index d5506277ca880..4b9055ec70419 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -374,10 +374,10 @@ void PPCMachObjectWriter::RecordPPCRelocation( Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); } -MCObjectWriter *llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS, - bool Is64Bit, uint32_t CPUType, - uint32_t CPUSubtype) { +std::unique_ptr<MCObjectWriter> +llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, + uint32_t CPUType, uint32_t CPUSubtype) { return createMachObjectWriter( - new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS, + llvm::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS, /*IsLittleEndian=*/false); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h index acea600fbb0da..603ac960133f9 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h @@ -70,6 +70,22 @@ namespace PPC { /// Assume the condition register is set by MI(a,b), return the predicate if /// we modify the instructions such that condition register is set by MI(b,a). Predicate getSwappedPredicate(Predicate Opcode); + + /// Return the condition without hint bits. + inline unsigned getPredicateCondition(Predicate Opcode) { + return (unsigned)(Opcode & ~BR_HINT_MASK); + } + + /// Return the hint bits of the predicate. + inline unsigned getPredicateHint(Predicate Opcode) { + return (unsigned)(Opcode & BR_HINT_MASK); + } + + /// Return predicate consisting of specified condition and hint bits.
+ inline Predicate getPredicate(unsigned Condition, unsigned Hint) { + return (Predicate)((Condition & ~BR_HINT_MASK) | + (Hint & BR_HINT_MASK)); + } } } diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td index aea022f887667..dc6ed16e53ce7 100644 --- a/lib/Target/PowerPC/P9InstrResources.td +++ b/lib/Target/PowerPC/P9InstrResources.td @@ -12,11 +12,29 @@ // is listed here. Instructions in this file belong to itinerary classes that // have instructions with different resource requirements. // +// The makeup of the P9 CPU is modeled as follows: +// - Each CPU is made up of two superslices. +// - Each superslice is made up of two slices. Therefore, there are 4 slices +// for each CPU. +// - Up to 6 instructions can be dispatched to each CPU. Three per superslice. +// - Each CPU has: +// - One CY (Crypto) unit P9_CY_* +// - One DFU (Decimal Floating Point and Quad Precision) unit P9_DFU_* +// - Two PM (Permute) units. One on each superslice. P9_PM_* +// - Two DIV (Fixed Point Divide) units. One on each superslice. P9_DIV_* +// - Four ALU (Fixed Point Arithmetic) units. One on each slice. P9_ALU_* +// - Four DP (Floating Point) units. One on each slice. P9_DP_* +// This also includes fixed point multiply add. +// - Four AGEN (Address Generation) units. One for each slice. P9_AGEN_* +// - Four Load/Store Queues. P9_LS_* +// - Each set of instructions will require a number of these resources. //===----------------------------------------------------------------------===// - +// Two cycle ALU vector operation that uses an entire superslice. +// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C], + DISP_1C, DISP_1C, DISP_1C], (instrs VADDCUW, VADDUBM, @@ -26,47 +44,41 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, VAND, VANDC, VCMPEQUB, - VCMPEQUBo, VCMPEQUD, - VCMPEQUDo, VCMPEQUH, - VCMPEQUHo, VCMPEQUW, - VCMPEQUWo, - VCMPGTSB, - VCMPGTSBo, - VCMPGTSD, - VCMPGTSDo, - VCMPGTSH, - VCMPGTSHo, - VCMPGTSW, - VCMPGTSWo, - VCMPGTUB, - VCMPGTUBo, - VCMPGTUD, - VCMPGTUDo, - VCMPGTUH, - VCMPGTUHo, - VCMPGTUW, - VCMPGTUWo, VCMPNEB, - VCMPNEBo, VCMPNEH, - VCMPNEHo, VCMPNEW, - VCMPNEWo, VCMPNEZB, - VCMPNEZBo, VCMPNEZH, - VCMPNEZHo, VCMPNEZW, - VCMPNEZWo, VEQV, VEXTSB2D, VEXTSB2W, VEXTSH2D, VEXTSH2W, VEXTSW2D, + VRLB, + VRLD, + VRLDMI, + VRLDNM, + VRLH, + VRLW, + VRLWMI, + VRLWNM, + VSRAB, + VSRAD, + VSRAH, + VSRAW, + VSRB, + VSRD, + VSRH, + VSRW, + VSLB, + VSLD, + VSLH, + VSLW, VMRGEW, VMRGOW, VNAND, @@ -77,9 +89,7 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, VORC, VPOPCNTB, VPOPCNTH, - VPOPCNTW, VSEL, - VSUBCUW, VSUBUBM, VSUBUDM, VSUBUHM, @@ -98,6 +108,8 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, XVNEGDP, XVNEGSP, XVXEXPDP, + XVIEXPSP, + XVXEXPSP, XXLAND, XXLANDC, XXLEQV, @@ -107,28 +119,128 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, XXLORf, XXLORC, XXLXOR, - XXSEL -)>; - -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs + XXSEL, XSABSQP, XSCPSGNQP, XSIEXPQP, XSNABSQP, XSNEGQP, - XSXEXPQP, - XSABSDP, - XSCPSGNDP, - XSIEXPDP, + XSXEXPQP +)>; + +// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a +// single slice. However, since it is Restricted it requires all 3 dispatches +// (DISP) for that superslice.
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + FCMPUS, + FCMPUD, + XSTSTDCDP, + XSTSTDCSP +)>; + +// Standard Dispatch ALU operation for 3 cycles. Only one slice used. +def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], + (instrs + XSMAXCDP, + XSMAXDP, + XSMAXJDP, + XSMINCDP, + XSMINDP, + XSMINJDP, + XSTDIVDP, + XSTSQRTDP, + XSCMPEQDP, + XSCMPEXPDP, + XSCMPGEDP, + XSCMPGTDP, + XSCMPODP, + XSCMPUDP, + XSXSIGDP, + XSCVSPDPN +)>; + +// Standard Dispatch ALU operation for 2 cycles. Only one slice used. +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], + (instrs + ADDIStocHA, + ADDItocL, + MCRF, + MCRXRX, + SLD, + SRD, + SRAD, + SRADI, + RLDIC, XSNABSDP, + XSXEXPDP, + XSABSDP, XSNEGDP, - XSXEXPDP + XSCPSGNDP )>; -def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a +// single slice. However, since it is Restricted it requires all 3 dispatches +// (DISP) for that superslice. +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + RLDCL, + RLDCR, + RLDIMI, + RLDICL, + RLDICR, + RLDICL_32_64, + XSIEXPDP, + FMR, + FABSD, + FABSS, + FNABSD, + FNABSS, + FNEGD, + FNEGS, + FCPSGND, + FCPSGNS +)>; +// Three cycle ALU vector operation that uses an entire superslice. +// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. +def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], + (instrs + VBPERMD, + VABSDUB, + VABSDUH, + VABSDUW, + VADDUBS, + VADDUHS, + VADDUWS, + VAVGSB, + VAVGSH, + VAVGSW, + VAVGUB, + VAVGUH, + VAVGUW, + VCMPEQFP, + VCMPEQFPo, + VCMPGEFP, + VCMPGEFPo, + VCMPBFP, + VCMPBFPo, + VCMPGTFP, + VCMPGTFPo, + VCLZB, + VCLZD, + VCLZH, + VCLZW, + VCTZB, + VCTZD, + VCTZH, + VCTZW, + VADDSBS, + VADDSHS, + VADDSWS, + VMINFP, VMINSB, VMINSD, VMINSH, @@ -137,55 +249,54 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C VMINUD, VMINUH, VMINUW, + VMAXFP, + VMAXSB, + VMAXSD, + VMAXSH, + VMAXSW, + VMAXUB, + VMAXUD, + VMAXUH, + VMAXUW, + VPOPCNTW, VPOPCNTD, VPRTYBD, VPRTYBW, - VRLB, - VRLD, - VRLDMI, - VRLDNM, - VRLH, - VRLW, - VRLWMI, - VRLWNM, VSHASIGMAD, VSHASIGMAW, - VSLB, - VSLD, - VSLH, - VSLW, - VSRAB, - VSRAD, - VSRAH, - VSRAW, - VSRB, - VSRD, - VSRH, - VSRW, VSUBSBS, VSUBSHS, VSUBSWS, VSUBUBS, VSUBUHS, VSUBUWS, - XSCMPEQDP, - XSCMPEXPDP, - XSCMPGEDP, - XSCMPGTDP, - XSCMPODP, - XSCMPUDP, - XSCVSPDPN, - XSMAXCDP, - XSMAXDP, - XSMAXJDP, - XSMINCDP, - XSMINDP, - XSMINJDP, - XSTDIVDP, - XSTSQRTDP, - XSTSTDCDP, - XSTSTDCSP, - XSXSIGDP, + VSUBCUW, + VCMPGTSB, + VCMPGTSBo, + VCMPGTSD, + VCMPGTSDo, + VCMPGTSH, + VCMPGTSHo, + VCMPGTSW, + VCMPGTSWo, + VCMPGTUB, + VCMPGTUBo, + VCMPGTUD, + VCMPGTUDo, + VCMPGTUH, + VCMPGTUHo, + VCMPGTUW, + VCMPGTUWo, + VCMPNEBo, + VCMPNEHo, + VCMPNEWo, + VCMPNEZBo, + VCMPNEZHo, + VCMPNEZWo, + VCMPEQUBo, + VCMPEQUDo, + VCMPEQUHo, + VCMPEQUWo, XVCMPEQDP, XVCMPEQDPo, XVCMPEQSP, @@ -198,7 +309,6 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C XVCMPGTDPo, XVCMPGTSP, XVCMPGTSPo, - XVIEXPSP, XVMAXDP, XVMAXSP, XVMINDP, @@ -209,58 +319,15 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C XVTSQRTSP, XVTSTDCDP, XVTSTDCSP, - XVXEXPSP, XVXSIGDP, XVXSIGSP )>; -def : InstRW<[P9_ALUE_4C, P9_ALUO_4C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], - (instrs - VABSDUB, - VABSDUH, - VABSDUW, - VADDSBS,
- VADDSHS, - VADDSWS, - VADDUBS, - VADDUHS, - VADDUWS, - VAVGSB, - VAVGSH, - VAVGSW, - VAVGUB, - VAVGUH, - VAVGUW, - VBPERMD, - VCLZB, - VCLZD, - VCLZH, - VCLZW, - VCMPBFP, - VCMPBFPo, - VCMPGTFP, - VCMPGTFPo, - VCTZB, - VCTZD, - VCTZH, - VCTZW, - VMAXFP, - VMAXSB, - VMAXSD, - VMAXSH, - VMAXSW, - VMAXUB, - VMAXUD, - VMAXUH, - VMAXUW, - VMINFP, - VCMPEQFP, - VCMPEQFPo, - VCMPGEFP, - VCMPGEFPo -)>; - -def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 7 cycle DP vector operation that uses an entire superslice. +// Uses both DP units (the even DPE and odd DPO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. +def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs VADDFP, VCTSXS, @@ -367,8 +434,47 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], VSUMSWS )>; +// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three +// dispatch units for the superslice. def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + FRSP, + FRIND, + FRINS, + FRIPD, + FRIPS, + FRIZD, + FRIZS, + FRIMD, + FRIMS, + FRE, + FRES, + FRSQRTE, + FRSQRTES, + FMADDS, + FMADD, + FMSUBS, + FMSUB, + FNMADDS, + FNMADD, + FNMSUBS, + FNMSUB, + FSELD, + FSELS, + FADDS, + FMULS, + FMUL, + FSUBS, + FCFID, + FCTID, + FCTIDZ, + FCFIDU, + FCFIDS, + FCFIDUS, + FCTIDUZ, + FCTIWUZ, + FCTIW, + FCTIWZ, XSMADDADP, XSMADDASP, XSMADDMDP, @@ -389,7 +495,19 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], XSNMSUBMSP )>; +// 7 cycle Restricted DP operation and one 2 cycle ALU operation. +// The DP is restricted so we need a full 5 dispatches. +def : InstRW<[P9_DPOpAndALUOp_9C, IP_EXEC_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + FMULo, + FMADDo, + FMSUBo, + FNMADDo, + FNMSUBo +)>; +// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units. def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs XSADDDP, @@ -397,8 +515,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], XSCVDPHP, XSCVDPSP, XSCVDPSXDS, + XSCVDPSXDSs, XSCVDPSXWS, XSCVDPUXDS, + XSCVDPUXDSs, XSCVDPUXWS, XSCVHPDP, XSCVSPDP, @@ -421,7 +541,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], XSCVDPSPN )>; -def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], +// Three Cycle PM operation. Only one PM unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], (instrs VBPERMQ, VCLZLSBB, @@ -469,7 +592,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], VSLO, VSLV, VSPLTB, + VSPLTBs, VSPLTH, + VSPLTHs, VSPLTISB, VSPLTISH, VSPLTISW, @@ -498,6 +623,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], XXSLDWI, XXSPLTIB, XXSPLTW, + XXSPLTWs, + XXPERMDI, + XXPERMDIs, VADDCUQ, VADDECUQ, VADDEUQM, @@ -517,7 +645,10 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], XSXSIGQP )>; -def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. 
+def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSADDQP, XSADDQPO, @@ -536,7 +667,10 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], XSSUBQPO )>; -def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSMADDQP, XSMADDQPO, @@ -550,45 +684,57 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], XSNMSUBQPO )>; -def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSDIVQP, XSDIVQPO )>; -def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSSQRTQP, XSSQRTQPO )>; -// Load Operation in IIC_LdStLFD - +// 5 Cycle load uses a single slice. def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C], (instrs LXSDX, LXVD2X, LXSIWZX, LXV, - LXSD + LXVX, + LXSD, + DFLOADf64, + XFLOADf64 )>; -def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], +// 4 Cycle load uses a single slice. +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C], (instrs - LFIWZX, - LFDX, - LFD + COPY )>; -def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// 4 Cycle Restricted load uses a single slice but the dispatch for the whole +// superslice. +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - LXSSPX, - LXSIWAX, - LXSSP + LFIWZX, + LFDX, + LFD )>; -def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, +// Cracked Restricted Load instruction. +// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU +// operations cannot be done at the same time and so their latencies are added. +// Full 6 dispatches are required as this is both cracked and restricted. +def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LFIWAX, @@ -596,14 +742,38 @@ def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, LFS )>; -def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C], +// Cracked Load instruction. +// Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU +// operations cannot be done at the same time and so their latencies are added. +// Full 4 dispatches are required as this is a cracked instruction. +def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + LXSSPX, + LXSIWAX, + LXSSP, + DFLOADf32, + XFLOADf32, + LIWAX, + LIWZX +)>; + +// Cracked Load that requires the PM resource. +// Since the Load and the PM cannot be done at the same time the latencies are +// added. Requires 8 cycles. +// Since the PM requires the full superslice we need both EXECE, EXECO pipelines +// as well as 3 dispatches for the PM. The Load requires the remaining 2 +// dispatches. 
+def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LXVDSX, + LXVWSX, LXVW4X )>; -// Store Operations in IIC_LdStSTFD. - +// Single slice Restricted store operation. The restricted operation requires +// all three dispatches for the superslice. def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], (instrs STFS, @@ -613,74 +783,88 @@ def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], STFDX, STXSDX, STXSSPX, - STXSIWX + STXSIWX, + DFSTOREf32, + DFSTOREf64, + XFSTOREf32, + XFSTOREf64, + STIWX )>; -def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C], +// Store operation that requires the whole superslice. +def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs STXVD2X, STXVW4X )>; -// Divide Operations in IIC_IntDivW, IIC_IntDivD. - -def : InstRW<[P9_DIV_16C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVW, - DIVWU + DIVWU, + MODSW )>; -def : InstRW<[P9_DIV_24C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVWE, DIVD, DIVWEU, - DIVDU + DIVDU, + MODSD, + MODUD, + MODUW )>; -def : InstRW<[P9_DIV_40C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVDE, DIVDEU )>; -def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// Cracked DIV and ALU operation. Requires one full slice for the ALU operation +// and one full superslice for the DIV operation since there is only one DIV +// per superslice. Latency of DIV plus ALU is 26. +def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + DIVDo, + DIVDUo, DIVWEo, DIVWEUo )>; -def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// Cracked DIV and ALU operation. Requires one full slice for the ALU operation +// and one full superslice for the DIV operation since there is only one DIV +// per superslice. Latency of DIV plus ALU is 42. +def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs DIVDEo, DIVDEUo )>; -// Rotate Operations in IIC_IntRotateD, IIC_IntRotateDI -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs - SLD, - SRD, - SRAD, - SRADI, - RLDIC -)>; - -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], - (instrs - RLDCL, - RLDCR, - RLDIMI, - RLDICL, - RLDICR, - RLDICL_32_64 -)>; - // CR access instructions in _BrMCR, IIC_BrMCRX. +// Cracked, restricted, ALU operations. +// Here the two ALU ops can actually be done in parallel and therefore the +// latencies are not added together. 
Otherwise this is like having two +// instructions running together on two pipelines and 6 dispatches. +// ALU ops are 2 cycles each. def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs @@ -690,13 +874,12 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, MTCRF8 )>; -def : InstRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs - MCRF, - MCRXRX -)>; - -def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C, +// Cracked, restricted, ALU operations. +// Here the two ALU ops can actually be done in parallel and therefore the +// latencies are not added together. Otherwise this is like having two +// instructions running together on two pipelines and 6 dispatches. +// ALU ops are 3 cycles each. +def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs MCRFS @@ -704,93 +887,71 @@ def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C, // FP Div instructions in IIC_FPDivD and IIC_FPDivS. +// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - FDIV, - XSDIVDP + FDIV )>; -def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU. +def : InstRW<[P9_DPOpAndALUOp_35C_8, IP_EXEC_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - FDIVS, - XSDIVSP + FDIVo )>; -def : InstRW<[P9_DP_24C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Instruction. Takes one slice and 2 dispatches. +def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - XVDIVSP + XSDIVDP )>; -def : InstRW<[P9_DP_33C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - XVDIVDP + FDIVS )>; -// FP Instructions in IIC_FPGeneral, IIC_FPFused +// 22 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU. +def : InstRW<[P9_DPOpAndALUOp_24C_5, IP_EXEC_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + FDIVSo +)>; -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 22 Cycle DP Instruction. Takes one slice and 2 dispatches. +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - FRSP, - FRIND, - FRINS, - FRIPD, - FRIPS, - FRIZD, - FRIZS, - FRIMD, - FRIMS, - FRE, - FRES, - FRSQRTE, - FRSQRTES, - FMADDS, - FMADD, - FMSUBS, - FMSUB, - FNMADDS, - FNMADD, - FNMSUBS, - FNMSUB, - FSELD, - FSELS, - FADDS, - FMULS, - FMUL, - FSUBS, - FCFID, - FCTID, - FCTIDZ, - FCFIDU, - FCFIDS, - FCFIDUS, - FCTIDUZ, - FCTIWUZ, - FCTIW, - FCTIWZ + XSDIVSP )>; -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 24 Cycle DP Vector Instruction. Takes one full superslice. +// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given +// superslice. +def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs - FMR, - FABSD, - FABSS, - FNABSD, - FNABSS, - FNEGD, - FNEGS, - FCPSGND, - FCPSGNS + XVDIVSP )>; -def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Vector Instruction. Takes one full superslice. +// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given +// superslice. 
+def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs - FCMPUS, - FCMPUD + XVDIVDP )>; // Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX. -def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C, +// Instruction cracked into three pieces. One Load and two ALU operations. +// The Load and one of the ALU ops cannot be run at the same time and so the +// latencies are added together for 6 cycles. The remaining ALU is 2 cycles. +// Both the load and the ALU that depends on it are restricted and so they take +// a total of 6 dispatches. The final 2 dispatches come from the second ALU op. +// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load. +def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], @@ -799,10 +960,32 @@ def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C, LFSUX )>; -def : InstRW<[P9_LS_5C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, +// Cracked instruction made up of a Load and an ALU. The ALU does not depend on +// the load and so it can be run at the same time as the load. The load is also +// restricted. 3 dispatches are from the restricted load while the other two +// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline +// is required for the ALU. +def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LFDU, LFDUX )>; +// Crypto Instructions + +// 6 Cycle CY operation. Only one CY unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + VPMSUMB, + VPMSUMD, + VPMSUMH, + VPMSUMW, + VCIPHER, + VCIPHERLAST, + VNCIPHER, + VNCIPHERLAST, + VSBOX +)>; diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index ad92ac8ce1207..dfdec246e8686 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -26,8 +26,10 @@ namespace llvm { class PassRegistry; class FunctionPass; class MachineInstr; + class MachineOperand; class AsmPrinter; class MCInst; + class MCOperand; FunctionPass *createPPCCTRLoops(); #ifndef NDEBUG @@ -39,20 +41,28 @@ namespace llvm { FunctionPass *createPPCVSXCopyPass(); FunctionPass *createPPCVSXFMAMutatePass(); FunctionPass *createPPCVSXSwapRemovalPass(); + FunctionPass *createPPCReduceCRLogicalsPass(); FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); + FunctionPass *createPPCBranchCoalescingPass(); FunctionPass *createPPCQPXLoadSplatPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL); FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); FunctionPass *createPPCExpandISELPass(); + FunctionPass *createPPCPreEmitPeepholePass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP, bool isDarwin); + bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, + MCOperand &OutMO, AsmPrinter &AP, + bool isDarwin); void initializePPCVSXFMAMutatePass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); + void initializePPCPreEmitPeepholePass(PassRegistry &); void initializePPCTLSDynamicCallPass(PassRegistry &); + void initializePPCMIPeepholePass(PassRegistry&); extern char &PPCVSXFMAMutateID; namespace PPCII { diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp
b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 841b8c5144641..17451900840a4 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -19,6 +19,7 @@ #include "InstPrinter/PPCInstPrinter.h" #include "MCTargetDesc/PPCMCExpr.h" #include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCPredicates.h" #include "PPC.h" #include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" @@ -506,7 +507,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst TmpInst; bool isPPC64 = Subtarget->isPPC64(); bool isDarwin = TM.getTargetTriple().isOSDarwin(); - const Module *M = MF->getFunction()->getParent(); + const Module *M = MF->getFunction().getParent(); PICLevel::Level PL = M->getPICLevel(); // Lower multi-instruction pseudo operations. @@ -520,7 +521,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return LowerPATCHPOINT(SM, *MI); case PPC::MoveGOTtoLR: { - // Transform %LR = MoveGOTtoLR + // Transform %lr = MoveGOTtoLR // Into this: bl _GLOBAL_OFFSET_TABLE_@local-4 // _GLOBAL_OFFSET_TABLE_@local-4 (instruction preceding // _GLOBAL_OFFSET_TABLE_) has exactly one instruction: @@ -541,7 +542,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case PPC::MovePCtoLR: case PPC::MovePCtoLR8: { - // Transform %LR = MovePCtoLR + // Transform %lr = MovePCtoLR // Into this, where the label is the PIC base: // bl L1$pb // L1$pb: @@ -559,9 +560,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::UpdateGBR: { - // Transform %Rd = UpdateGBR(%Rt, %Ri) - // Into: lwz %Rt, .L0$poff - .L0$pb(%Ri) - // add %Rd, %Rt, %Ri + // Transform %rd = UpdateGBR(%rt, %ri) + // Into: lwz %rt, .L0$poff - .L0$pb(%ri) + // add %rd, %rt, %ri // Get the offset from the GOT Base Register to the GOT LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); MCSymbol *PICOffset = @@ -576,7 +577,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const MCOperand TR = TmpInst.getOperand(1); const MCOperand PICR = TmpInst.getOperand(0); - // Step 1: lwz %Rt, .L$poff - .L$pb(%Ri) + // Step 1: lwz %rt, .L$poff - .L$pb(%ri) TmpInst.getOperand(1) = MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext)); TmpInst.getOperand(0) = TR; @@ -591,7 +592,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::LWZtoc: { - // Transform %R3 = LWZtoc , %R2 + // Transform %r3 = LWZtoc @min1, %r2 LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to LWZ, and the global address operand to be a @@ -635,7 +636,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::LDtocCPT: case PPC::LDtocBA: case PPC::LDtoc: { - // Transform %X3 = LDtoc , %X2 + // Transform %x3 = LDtoc @min1, %x2 LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to LD, and the global address operand to be a @@ -666,7 +667,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case PPC::ADDIStocHA: { - // Transform %Xd = ADDIStocHA %X2, + // Transform %xd = ADDIStocHA %x2, @sym LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to ADDIS8. If the global address is external, has @@ -713,7 +714,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::LDtocL: { - // Transform %Xd = LDtocL , %Xs + // Transform %xd = LDtocL @sym, %xs LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to LD. 
If the global address is external, has @@ -713,7 +714,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::LDtocL: { - // Transform %Xd = LDtocL , %Xs + // Transform %xd = LDtocL @sym, %xs LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to LD. If the global address is external, has @@ -756,7 +757,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDItocL: { - // Transform %Xd = ADDItocL %Xs, + // Transform %xd = ADDItocL %xs, @sym LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to ADDI8. If the global address is external, then @@ -787,8 +788,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDISgotTprelHA: { - // Transform: %Xd = ADDISgotTprelHA %X2, - // Into: %Xd = ADDIS8 %X2, sym@got@tprel@ha + // Transform: %xd = ADDISgotTprelHA %x2, @sym + // Into: %xd = ADDIS8 %x2, sym@got@tprel@ha assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); @@ -804,7 +805,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case PPC::LDgotTprelL: case PPC::LDgotTprelL32: { - // Transform %Xd = LDgotTprelL , %Xs + // Transform %xd = LDgotTprelL @sym, %xs LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to LD. @@ -865,8 +866,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDIStlsgdHA: { - // Transform: %Xd = ADDIStlsgdHA %X2, - // Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha + // Transform: %xd = ADDIStlsgdHA %x2, @sym + // Into: %xd = ADDIS8 %x2, sym@got@tlsgd@ha assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); @@ -881,11 +882,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDItlsgdL: - // Transform: %Xd = ADDItlsgdL %Xs, - // Into: %Xd = ADDI8 %Xs, sym@got@tlsgd@l + // Transform: %xd = ADDItlsgdL %xs, @sym + // Into: %xd = ADDI8 %xs, sym@got@tlsgd@l case PPC::ADDItlsgdL32: { - // Transform: %Rd = ADDItlsgdL32 %Rs, - // Into: %Rd = ADDI %Rs, sym@got@tlsgd + // Transform: %rd = ADDItlsgdL32 %rs, @sym + // Into: %rd = ADDI %rs, sym@got@tlsgd const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -901,17 +902,17 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::GETtlsADDR: - // Transform: %X3 = GETtlsADDR %X3, + // Transform: %x3 = GETtlsADDR %x3, @sym // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd) case PPC::GETtlsADDR32: { - // Transform: %R3 = GETtlsADDR32 %R3, + // Transform: %r3 = GETtlsADDR32 %r3, @sym // Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSGD); return; } case PPC::ADDIStlsldHA: { - // Transform: %Xd = ADDIStlsldHA %X2, - // Into: %Xd = ADDIS8 %X2, sym@got@tlsld@ha + // Transform: %xd = ADDIStlsldHA %x2, @sym + // Into: %xd = ADDIS8 %x2, sym@got@tlsld@ha assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); @@ -926,11 +927,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDItlsldL: - // Transform: %Xd = ADDItlsldL %Xs, - // Into: %Xd = ADDI8 %Xs, sym@got@tlsld@l + // Transform: %xd = ADDItlsldL %xs, @sym + // Into: %xd = ADDI8 %xs, sym@got@tlsld@l case PPC::ADDItlsldL32: { - // Transform: %Rd = ADDItlsldL32 %Rs, - // Into: %Rd = ADDI %Rs, sym@got@tlsld + // Transform: %rd = ADDItlsldL32 %rs, @sym + // Into: %rd = ADDI %rs, sym@got@tlsld const MachineOperand &MO = MI->getOperand(2); const GlobalValue
*GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -946,20 +947,20 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::GETtlsldADDR: - // Transform: %X3 = GETtlsldADDR %X3, + // Transform: %x3 = GETtlsldADDR %x3, @sym // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsld) case PPC::GETtlsldADDR32: { - // Transform: %R3 = GETtlsldADDR32 %R3, + // Transform: %r3 = GETtlsldADDR32 %r3, @sym // Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSLD); return; } case PPC::ADDISdtprelHA: - // Transform: %Xd = ADDISdtprelHA %Xs, - // Into: %Xd = ADDIS8 %Xs, sym@dtprel@ha + // Transform: %xd = ADDISdtprelHA %xs, @sym + // Into: %xd = ADDIS8 %xs, sym@dtprel@ha case PPC::ADDISdtprelHA32: { - // Transform: %Rd = ADDISdtprelHA32 %Rs, - // Into: %Rd = ADDIS %Rs, sym@dtprel@ha + // Transform: %rd = ADDISdtprelHA32 %rs, @sym + // Into: %rd = ADDIS %rs, sym@dtprel@ha const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -975,11 +976,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDIdtprelL: - // Transform: %Xd = ADDIdtprelL %Xs, - // Into: %Xd = ADDI8 %Xs, sym@dtprel@l + // Transform: %xd = ADDIdtprelL %xs, @sym + // Into: %xd = ADDI8 %xs, sym@dtprel@l case PPC::ADDIdtprelL32: { - // Transform: %Rd = ADDIdtprelL32 %Rs, - // Into: %Rd = ADDI %Rs, sym@dtprel@l + // Transform: %rd = ADDIdtprelL32 %rs, @sym + // Into: %rd = ADDI %rs, sym@dtprel@l const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -996,8 +997,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::MFOCRF: case PPC::MFOCRF8: if (!Subtarget->hasMFOCRF()) { - // Transform: %R3 = MFOCRF %CR7 - // Into: %R3 = MFCR ;; cr7 + // Transform: %r3 = MFOCRF %cr7 + // Into: %r3 = MFCR ;; cr7 unsigned NewOpcode = MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8; OutStreamer->AddComment(PPCInstPrinter:: @@ -1010,8 +1011,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::MTOCRF: case PPC::MTOCRF8: if (!Subtarget->hasMFOCRF()) { - // Transform: %CR7 = MTOCRF %R3 - // Into: MTCRF mask, %R3 ;; cr7 + // Transform: %cr7 = MTOCRF %r3 + // Into: MTCRF mask, %r3 ;; cr7 unsigned NewOpcode = MI->getOpcode() == PPC::MTOCRF ? 
PPC::MTCRF : PPC::MTCRF8; unsigned Mask = 0x80 >> OutContext.getRegisterInfo() @@ -1089,7 +1090,61 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) { recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER); break; } - case TargetOpcode::PATCHABLE_FUNCTION_EXIT: { + case TargetOpcode::PATCHABLE_RET: { + unsigned RetOpcode = MI->getOperand(0).getImm(); + MCInst RetInst; + RetInst.setOpcode(RetOpcode); + for (const auto &MO : + make_range(std::next(MI->operands_begin()), MI->operands_end())) { + MCOperand MCOp; + if (LowerPPCMachineOperandToMCOperand(MO, MCOp, *this, false)) + RetInst.addOperand(MCOp); + } + + bool IsConditional; + if (RetOpcode == PPC::BCCLR) { + IsConditional = true; + } else if (RetOpcode == PPC::TCRETURNdi8 || RetOpcode == PPC::TCRETURNri8 || + RetOpcode == PPC::TCRETURNai8) { + break; + } else if (RetOpcode == PPC::BLR8 || RetOpcode == PPC::TAILB8) { + IsConditional = false; + } else { + EmitToStreamer(*OutStreamer, RetInst); + break; + } + + MCSymbol *FallthroughLabel; + if (IsConditional) { + // Before: + // bgtlr cr0 + // + // After: + // ble cr0, .end + // .p2align 3 + // .begin: + // blr # lis 0, FuncId[16..32] + // nop # li 0, FuncId[0..15] + // std 0, -8(1) + // mflr 0 + // bl __xray_FunctionExit + // mtlr 0 + // blr + // .end: + // + // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number + // of instructions change. + FallthroughLabel = OutContext.createTempSymbol(); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(PPC::BCC) + .addImm(PPC::InvertPredicate( + static_cast<PPC::Predicate>(MI->getOperand(1).getImm()))) + .addReg(MI->getOperand(2).getReg()) + .addExpr(MCSymbolRefExpr::create(FallthroughLabel, OutContext))); + RetInst = MCInst(); + RetInst.setOpcode(PPC::BLR8); + } // .p2align 3 // .begin: // b(lr)? # lis 0, FuncId[16..32] @@ -1098,24 +1153,14 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) { // mflr 0 // bl __xray_FunctionExit // mtlr 0 - // .end: // b(lr)? // // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number // of instructions change. - const MachineInstr *Next = [&] { - MachineBasicBlock::const_iterator It(MI); - assert(It != MI->getParent()->end()); - ++It; - assert(It->isReturn()); - return &*It; - }(); OutStreamer->EmitCodeAlignment(8); MCSymbol *BeginOfSled = OutContext.createTempSymbol(); OutStreamer->EmitLabel(BeginOfSled); - MCInst TmpInst; - LowerPPCMachineInstrToMCInst(Next, TmpInst, *this, false); - EmitToStreamer(*OutStreamer, TmpInst); + EmitToStreamer(*OutStreamer, RetInst); EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); EmitToStreamer( *OutStreamer, @@ -1127,15 +1172,18 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) { OutContext.getOrCreateSymbol("__xray_FunctionExit"), OutContext))); EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0)); + EmitToStreamer(*OutStreamer, RetInst); + if (IsConditional) + OutStreamer->EmitLabel(FallthroughLabel); recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT); break; } + case TargetOpcode::PATCHABLE_FUNCTION_EXIT: + llvm_unreachable("PATCHABLE_FUNCTION_EXIT should never be emitted"); case TargetOpcode::PATCHABLE_TAIL_CALL: - case TargetOpcode::PATCHABLE_RET: - // PPC's tail call instruction, e.g. PPC::TCRETURNdi8, doesn't really - // lower to a PPC::B instruction. The PPC::B instruction is generated - // before it, and handled by the normal case. - llvm_unreachable("Tail call is handled in the normal case. 
See comments" + // TODO: Define a trampoline `__xray_FunctionTailExit` and differentiate a + // normal function exit from a tail exit. + llvm_unreachable("Tail call is handled in the normal case. See comments " "around this assert."); } } @@ -1180,7 +1228,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { // linux/ppc32 - Normal entry label. if (!Subtarget->isPPC64() && (!isPositionIndependent() || - MF->getFunction()->getParent()->getPICLevel() == PICLevel::SmallPIC)) + MF->getFunction().getParent()->getPICLevel() == PICLevel::SmallPIC)) return AsmPrinter::EmitFunctionEntryLabel(); if (!Subtarget->isPPC64()) { diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp new file mode 100644 index 0000000000000..32d801b13ded9 --- /dev/null +++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp @@ -0,0 +1,784 @@ +//===-- CoalesceBranches.cpp - Coalesce blocks with the same condition ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Coalesce basic blocks guarded by the same branch condition into a single +/// basic block. +/// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-branch-coalescing" + +STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced"); +STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged"); +STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced"); + +namespace llvm { + void initializePPCBranchCoalescingPass(PassRegistry&); +} + +//===----------------------------------------------------------------------===// +// PPCBranchCoalescing +//===----------------------------------------------------------------------===// +/// +/// Improve scheduling by coalescing branches that depend on the same condition. +/// This pass looks for blocks that are guarded by the same branch condition +/// and attempts to merge the blocks together. Such opportunities arise from +/// the expansion of select statements in the IR. +/// +/// This pass does not handle implicit operands on branch statements. In order +/// to run on targets that use implicit operands, changes need to be made in the +/// canCoalesceBranch and canMerge methods. 
+/// +/// Example: the following LLVM IR +/// +/// %test = icmp eq i32 %x, 0 +/// %tmp1 = select i1 %test, double %a, double 2.000000e-03 +/// %tmp2 = select i1 %test, double %b, double 5.000000e-03 +/// +/// expands to the following machine code: +/// +/// %bb.0: derived from LLVM BB %entry +/// Live Ins: %f1 %f3 %x6 +/// +/// %0 = COPY %f1; F8RC:%0 +/// %5 = CMPLWI killed %4, 0; CRRC:%5 GPRC:%4 +/// %8 = LXSDX %zero8, killed %7, implicit %rm; +/// mem:LD8[ConstantPool] F8RC:%8 G8RC:%7 +/// BCC 76, %5, <%bb.2>; CRRC:%5 +/// Successors according to CFG: %bb.1(?%) %bb.2(?%) +/// +/// %bb.1: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.0 +/// Successors according to CFG: %bb.2(?%) +/// +/// %bb.2: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.0 %bb.1 +/// %9 = PHI %8, <%bb.1>, %0, <%bb.0>; +/// F8RC:%9,%8,%0 +/// +/// BCC 76, %5, <%bb.4>; CRRC:%5 +/// Successors according to CFG: %bb.3(?%) %bb.4(?%) +/// +/// %bb.3: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.2 +/// Successors according to CFG: %bb.4(?%) +/// +/// %bb.4: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.2 %bb.3 +/// %13 = PHI %12, <%bb.3>, %2, <%bb.2>; +/// F8RC:%13,%12,%2 +/// +/// BLR8 implicit %lr8, implicit %rm, implicit %f1 +/// +/// When this pattern is detected, branch coalescing will try to collapse +/// it by moving code in %bb.2 to %bb.0 and/or %bb.4 and removing %bb.3. +/// +/// If all conditions are met, the IR should collapse to: +/// +/// %bb.0: derived from LLVM BB %entry +/// Live Ins: %f1 %f3 %x6 +/// +/// %0 = COPY %f1; F8RC:%0 +/// %5 = CMPLWI killed %4, 0; CRRC:%5 GPRC:%4 +/// %8 = LXSDX %zero8, killed %7, implicit %rm; +/// mem:LD8[ConstantPool] F8RC:%8 G8RC:%7 +/// +/// BCC 76, %5, <%bb.4>; CRRC:%5 +/// Successors according to CFG: %bb.1(0x2aaaaaaa / 0x80000000 = 33.33%) +/// %bb.4(0x55555554 / 0x80000000 = 66.67%) +/// +/// %bb.1: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.0 +/// Successors according to CFG: %bb.4(0x40000000 / 0x80000000 = 50.00%) +/// +/// %bb.4: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.0 %bb.1 +/// %9 = PHI %8, <%bb.1>, %0, <%bb.0>; +/// F8RC:%9,%8,%0 +/// %13 = PHI %12, <%bb.1>, %2, <%bb.0>; +/// F8RC:%13,%12,%2 +/// +/// BLR8 implicit %lr8, implicit %rm, implicit %f1 +/// +/// Branch Coalescing does not split blocks; it moves everything in the same +/// direction, ensuring it does not break use/definition semantics. +/// +/// PHI nodes and their corresponding use instructions are moved to the +/// successor block if there are no uses of them within the successor block's +/// PHI nodes. PHI node ordering cannot be assumed. +/// +/// Non-PHI instructions can be moved up to the predecessor basic block or +/// down to the successor basic block following any PHI instructions. Whether +/// they move up or down depends on whether the register(s) defined in the +/// instructions are used in the current block or in any PHI instructions at +/// the beginning of the successor block.
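+///
+/// For orientation, here is a hedged, hypothetical C++ source fragment
+/// (invented for illustration; it is not taken from the LLVM test suite)
+/// of the kind a front end lowers to the two-select IR pattern above:
+///
+///    double blend(int x, double a, double b) {
+///      double t1 = (x == 0) ? a : 2.000000e-03; // first select
+///      double t2 = (x == 0) ? b : 5.000000e-03; // second select
+///      return t1 + t2;
+///    }
+///
+/// Both conditional expressions test the same condition (x == 0), so after
+/// instruction selection each one becomes a BCC on the same CR register,
+/// producing the back-to-back coalescable triangles shown above.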
+ +namespace { + +class PPCBranchCoalescing : public MachineFunctionPass { + struct CoalescingCandidateInfo { + MachineBasicBlock *BranchBlock; // Block containing the branch + MachineBasicBlock *BranchTargetBlock; // Block branched to + MachineBasicBlock *FallThroughBlock; // Fall-through if branch not taken + SmallVector<MachineOperand, 4> Cond; + bool MustMoveDown; + bool MustMoveUp; + + CoalescingCandidateInfo(); + void clear(); + }; + + MachineDominatorTree *MDT; + MachinePostDominatorTree *MPDT; + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + + void initialize(MachineFunction &F); + bool canCoalesceBranch(CoalescingCandidateInfo &Cand); + bool identicalOperands(ArrayRef<MachineOperand> OperandList1, + ArrayRef<MachineOperand> OperandList2) const; + bool validateCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const; + +public: + static char ID; + + PPCBranchCoalescing() : MachineFunctionPass(ID) { + initializePPCBranchCoalescingPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Branch Coalescing"; } + + bool mergeCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion); + bool canMoveToBeginning(const MachineInstr &MI, + const MachineBasicBlock &MBB) const; + bool canMoveToEnd(const MachineInstr &MI, + const MachineBasicBlock &MBB) const; + bool canMerge(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const; + void moveAndUpdatePHIs(MachineBasicBlock *SourceRegionMBB, + MachineBasicBlock *TargetRegionMBB); + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // End anonymous namespace. + +char PPCBranchCoalescing::ID = 0; +/// createPPCBranchCoalescingPass - returns an instance of the Branch Coalescing +/// Pass +FunctionPass *llvm::createPPCBranchCoalescingPass() { + return new PPCBranchCoalescing(); +} + +INITIALIZE_PASS_BEGIN(PPCBranchCoalescing, DEBUG_TYPE, + "Branch Coalescing", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(PPCBranchCoalescing, DEBUG_TYPE, "Branch Coalescing", + false, false) + +PPCBranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo() + : BranchBlock(nullptr), BranchTargetBlock(nullptr), + FallThroughBlock(nullptr), MustMoveDown(false), MustMoveUp(false) {} + +void PPCBranchCoalescing::CoalescingCandidateInfo::clear() { + BranchBlock = nullptr; + BranchTargetBlock = nullptr; + FallThroughBlock = nullptr; + Cond.clear(); + MustMoveDown = false; + MustMoveUp = false; +} + +void PPCBranchCoalescing::initialize(MachineFunction &MF) { + MDT = &getAnalysis<MachineDominatorTree>(); + MPDT = &getAnalysis<MachinePostDominatorTree>(); + TII = MF.getSubtarget().getInstrInfo(); + MRI = &MF.getRegInfo(); +} + +/// +/// Analyze the branch statement for the given candidate to determine +/// if it can be coalesced. If the branch can be coalesced, then the +/// BranchTargetBlock and the FallThroughBlock are recorded in the specified +/// Candidate.
+/// +///\param[in,out] Cand The coalescing candidate to analyze +///\return true if and only if the branch can be coalesced, false otherwise +/// +bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) { + DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber() + << " can be coalesced:"); + MachineBasicBlock *FalseMBB = nullptr; + + if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB, + Cand.Cond)) { + DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n"); + return false; + } + + for (auto &I : Cand.BranchBlock->terminators()) { + DEBUG(dbgs() << "Looking at terminator : " << I << "\n"); + if (!I.isBranch()) + continue; + + // The analyzeBranch method does not include any implicit operands. + // This is not an issue on PPC but must be handled on other targets. + // For this pass to be made target-independent, the analyzeBranch API + // needs to be updated to support implicit operands and there would + // need to be a way to verify that any implicit operands would not be + // clobbered by merging blocks. This would include identifying the + // implicit operands as well as the basic block they are defined in. + // This could be done by changing the analyzeBranch API to have it also + // record and return the implicit operands and the blocks where they are + // defined. Alternatively, the BranchCoalescing code would need to be + // extended to identify the implicit operands. The analysis in canMerge + // must then be extended to prove that none of the implicit operands are + // changed in the blocks that are combined during coalescing. + if (I.getNumOperands() != I.getNumExplicitOperands()) { + DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I + << "\n"); + return false; + } + } + + if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) { + DEBUG(dbgs() << "EH Pad - skip\n"); + return false; + } + + // For now only consider triangles (i.e., BranchTargetBlock is set, + // FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock) + if (!Cand.BranchTargetBlock || FalseMBB || + !Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) { + DEBUG(dbgs() << "Does not form a triangle - skip\n"); + return false; + } + + // Ensure there are only two successors + if (Cand.BranchBlock->succ_size() != 2) { + DEBUG(dbgs() << "Does not have 2 successors - skip\n"); + return false; + } + + // Sanity check - the block must be able to fall through + assert(Cand.BranchBlock->canFallThrough() && + "Expecting the block to fall through!"); + + // We have already ensured there are exactly two successors to + // BranchBlock and that BranchTargetBlock is a successor to BranchBlock. + // Ensure the single fall-through block is empty. + MachineBasicBlock *Succ = + (*Cand.BranchBlock->succ_begin() == Cand.BranchTargetBlock) + ?
*Cand.BranchBlock->succ_rbegin() + : *Cand.BranchBlock->succ_begin(); + + assert(Succ && "Expecting a valid fall-through block\n"); + + if (!Succ->empty()) { + DEBUG(dbgs() << "Fall-through block contains code -- skip\n"); + return false; + } + + if (!Succ->isSuccessor(Cand.BranchTargetBlock)) { + DEBUG(dbgs() + << "Successor of fall through block is not branch taken block\n"); + return false; + } + + Cand.FallThroughBlock = Succ; + DEBUG(dbgs() << "Valid Candidate\n"); + return true; +} + +/// +/// Determine if the two operand lists are identical +/// +/// \param[in] OpList1 operand list +/// \param[in] OpList2 operand list +/// \return true if and only if the operand lists are identical +/// +bool PPCBranchCoalescing::identicalOperands( + ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const { + + if (OpList1.size() != OpList2.size()) { + DEBUG(dbgs() << "Operand list is different size\n"); + return false; + } + + for (unsigned i = 0; i < OpList1.size(); ++i) { + const MachineOperand &Op1 = OpList1[i]; + const MachineOperand &Op2 = OpList2[i]; + + DEBUG(dbgs() << "Op1: " << Op1 << "\n" + << "Op2: " << Op2 << "\n"); + + if (Op1.isIdenticalTo(Op2)) { + // filter out instructions with physical-register uses + if (Op1.isReg() && TargetRegisterInfo::isPhysicalRegister(Op1.getReg()) + // If the physical register is constant then we can assume the value + // has not changed between uses. + && !(Op1.isUse() && MRI->isConstantPhysReg(Op1.getReg()))) { + DEBUG(dbgs() << "The operands are not provably identical.\n"); + return false; + } + DEBUG(dbgs() << "Op1 and Op2 are identical!\n"); + continue; + } + + // If the operands are not identical, but are registers, check to see if the + // definition of the register produces the same value. If they produce the + // same value, consider them to be identical. + if (Op1.isReg() && Op2.isReg() && + TargetRegisterInfo::isVirtualRegister(Op1.getReg()) && + TargetRegisterInfo::isVirtualRegister(Op2.getReg())) { + MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg()); + MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg()); + if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) { + DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def + << " produce the same value!\n"); + } else { + DEBUG(dbgs() << "Operands produce different values\n"); + return false; + } + } else { + DEBUG(dbgs() << "The operands are not provably identical.\n"); + return false; + } + } + + return true; +} + +/// +/// Moves ALL PHI instructions in SourceMBB to the beginning of TargetMBB +/// and updates them to refer to the new block. PHI node ordering +/// cannot be assumed, so it does not matter where the PHI instructions +/// are moved to in TargetMBB.
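+///
+/// Sketch of the update (virtual-register and block numbers hypothetical):
+/// PHI operands are laid out as value/block pairs, so the block operands sit
+/// at indices 2, 4, .... A PHI such as
+///    %9 = PHI %8, <%bb.1>, %0, <SourceMBB>
+/// has any block operand equal to SourceMBB retargeted to TargetMBB before
+/// the whole range [begin, getFirstNonPHI) is spliced into TargetMBB.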
+/// +/// \param[in] SourceMBB block to move PHI instructions from +/// \param[in] TargetMBB block to move PHI instructions to +/// +void PPCBranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB, + MachineBasicBlock *TargetMBB) { + + MachineBasicBlock::iterator MI = SourceMBB->begin(); + MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI(); + + if (MI == ME) { + DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n"); + return; + } + + // Update all PHI instructions in SourceMBB and move to top of TargetMBB + for (MachineBasicBlock::iterator Iter = MI; Iter != ME; Iter++) { + MachineInstr &PHIInst = *Iter; + for (unsigned i = 2, e = PHIInst.getNumOperands() + 1; i != e; i += 2) { + MachineOperand &MO = PHIInst.getOperand(i); + if (MO.getMBB() == SourceMBB) + MO.setMBB(TargetMBB); + } + } + TargetMBB->splice(TargetMBB->begin(), SourceMBB, MI, ME); +} + +/// +/// This function checks if MI can be moved to the beginning of the TargetMBB +/// following PHI instructions. An MI instruction can be moved to the beginning +/// of the TargetMBB if there are no uses of it within the TargetMBB PHI nodes. +/// +/// \param[in] MI the machine instruction to move. +/// \param[in] TargetMBB the machine basic block to move to +/// \return true if it is safe to move MI to beginning of TargetMBB, +/// false otherwise. +/// +bool PPCBranchCoalescing::canMoveToBeginning(const MachineInstr &MI, + const MachineBasicBlock &TargetMBB + ) const { + + DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of " + << TargetMBB.getNumber() << "\n"); + + for (auto &Def : MI.defs()) { // Looking at Def + for (auto &Use : MRI->use_instructions(Def.getReg())) { + if (Use.isPHI() && Use.getParent() == &TargetMBB) { + DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n"); + return false; + } + } + } + + DEBUG(dbgs() << " Safe to move to the beginning.\n"); + return true; +} + +/// +/// This function checks if MI can be moved to the end of the TargetMBB, +/// immediately before the first terminator. An MI instruction can be moved +/// to the end of the TargetMBB if no PHI node defines what MI uses within +/// its own MBB. +/// +/// \param[in] MI the machine instruction to move. +/// \param[in] TargetMBB the machine basic block to move to +/// \return true if it is safe to move MI to end of TargetMBB, +/// false otherwise. +/// +bool PPCBranchCoalescing::canMoveToEnd(const MachineInstr &MI, + const MachineBasicBlock &TargetMBB + ) const { + + DEBUG(dbgs() << "Checking if " << MI << " can move to end of " + << TargetMBB.getNumber() << "\n"); + + for (auto &Use : MI.uses()) { + if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) { + MachineInstr *DefInst = MRI->getVRegDef(Use.getReg()); + if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) { + DEBUG(dbgs() << " *** Cannot move this instruction ***\n"); + return false; + } else { + DEBUG(dbgs() << " *** def is in another block -- safe to move!\n"); + } + } + } + + DEBUG(dbgs() << " Safe to move to the end.\n"); + return true; +} + +/// +/// This method checks to ensure the two coalescing candidates follow the +/// expected pattern required for coalescing. +/// +/// \param[in] SourceRegion The candidate to move statements from +/// \param[in] TargetRegion The candidate to move statements to +/// \return true if all instructions in SourceRegion.BranchBlock can be merged +/// into a block in TargetRegion; false otherwise.
+/// +bool PPCBranchCoalescing::validateCandidates( + CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const { + + if (TargetRegion.BranchTargetBlock != SourceRegion.BranchBlock) + llvm_unreachable("Expecting SourceRegion to immediately follow TargetRegion"); + else if (!MDT->dominates(TargetRegion.BranchBlock, SourceRegion.BranchBlock)) + llvm_unreachable("Expecting TargetRegion to dominate SourceRegion"); + else if (!MPDT->dominates(SourceRegion.BranchBlock, TargetRegion.BranchBlock)) + llvm_unreachable("Expecting SourceRegion to post-dominate TargetRegion"); + else if (!TargetRegion.FallThroughBlock->empty() || + !SourceRegion.FallThroughBlock->empty()) + llvm_unreachable("Expecting fall-through blocks to be empty"); + + return true; +} + +/// +/// This method determines whether the two coalescing candidates can be merged. +/// In order to be merged, all instructions must be able to +/// 1. Move to the beginning of the SourceRegion.BranchTargetBlock; +/// 2. Move to the end of the TargetRegion.BranchBlock. +/// Merging involves moving the instructions in the +/// TargetRegion.BranchTargetBlock (also SourceRegion.BranchBlock). +/// +/// This function first tries to move instructions from the +/// TargetRegion.BranchTargetBlock down, to the beginning of the +/// SourceRegion.BranchTargetBlock. This is not possible if any register defined +/// in TargetRegion.BranchTargetBlock is used in a PHI node in the +/// SourceRegion.BranchTargetBlock. In this case, check whether the statement +/// can be moved up, to the end of the TargetRegion.BranchBlock (immediately +/// before the branch statement). If it cannot move, then these blocks cannot +/// be merged. +/// +/// Note that there is no analysis for moving instructions past the fall-through +/// blocks because they are confirmed to be empty. An assert fires if they +/// are not. +/// +/// \param[in] SourceRegion The candidate to move statements from +/// \param[in] TargetRegion The candidate to move statements to +/// \return true if all instructions in SourceRegion.BranchBlock can be merged +/// into a block in TargetRegion, false otherwise. +/// +bool PPCBranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const { + if (!validateCandidates(SourceRegion, TargetRegion)) + return false; + + // Walk through PHI nodes first and see if they force the merge into the + // SourceRegion.BranchTargetBlock.
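+  // Concretely (register numbers hypothetical): if a PHI here defines %9
+  // and a PHI in SourceRegion.BranchTargetBlock reads %9, merging is
+  // abandoned; if %9 is instead read by a non-PHI instruction in this same
+  // block, every movable instruction is forced down.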
+ for (MachineBasicBlock::iterator + I = SourceRegion.BranchBlock->instr_begin(), + E = SourceRegion.BranchBlock->getFirstNonPHI(); + I != E; ++I) { + for (auto &Def : I->defs()) + for (auto &Use : MRI->use_instructions(Def.getReg())) { + if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) { + DEBUG(dbgs() << "PHI " << *I << " defines register used in another " + "PHI within branch target block -- can't merge\n"); + NumPHINotMoved++; + return false; + } + if (Use.getParent() == SourceRegion.BranchBlock) { + DEBUG(dbgs() << "PHI " << *I + << " defines register used in this " + "block -- all must move down\n"); + SourceRegion.MustMoveDown = true; + } + } + } + + // Walk through the remaining instructions to see if they should be merged + // into TargetRegion.BranchBlock (up) or SourceRegion.BranchTargetBlock (down) + for (MachineBasicBlock::iterator + I = SourceRegion.BranchBlock->getFirstNonPHI(), + E = SourceRegion.BranchBlock->end(); + I != E; ++I) { + if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) { + DEBUG(dbgs() << "Instruction " << *I + << " cannot move down - must move up!\n"); + SourceRegion.MustMoveUp = true; + } + if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) { + DEBUG(dbgs() << "Instruction " << *I + << " cannot move up - must move down!\n"); + SourceRegion.MustMoveDown = true; + } + } + + return (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) ? false : true; +} + +/// Merge the instructions from SourceRegion.BranchBlock, +/// SourceRegion.BranchTargetBlock, and SourceRegion.FallThroughBlock into +/// TargetRegion.BranchBlock, TargetRegion.BranchTargetBlock and +/// TargetRegion.FallThroughBlock respectively. +/// +/// The successors for blocks in TargetRegion will be updated to use the +/// successors from blocks in SourceRegion. Finally, the blocks in SourceRegion +/// will be removed from the function. +/// +/// A region consists of a BranchBlock, a FallThroughBlock, and a +/// BranchTargetBlock. Branch coalescing works on patterns where the +/// TargetRegion's BranchTargetBlock must also be the SourceRegion's +/// BranchBlock.
+/// +/// Before mergeCandidates: +/// +/// +---------------------------+ +/// | TargetRegion.BranchBlock | +/// +---------------------------+ +/// / | +/// / +--------------------------------+ +/// | | TargetRegion.FallThroughBlock | +/// \ +--------------------------------+ +/// \ | +/// +----------------------------------+ +/// | TargetRegion.BranchTargetBlock | +/// | SourceRegion.BranchBlock | +/// +----------------------------------+ +/// / | +/// / +--------------------------------+ +/// | | SourceRegion.FallThroughBlock | +/// \ +--------------------------------+ +/// \ | +/// +----------------------------------+ +/// | SourceRegion.BranchTargetBlock | +/// +----------------------------------+ +/// +/// After mergeCandidates: +/// +/// +-----------------------------+ +/// | TargetRegion.BranchBlock | +/// | SourceRegion.BranchBlock | +/// +-----------------------------+ +/// / | +/// / +---------------------------------+ +/// | | TargetRegion.FallThroughBlock | +/// | | SourceRegion.FallThroughBlock | +/// \ +---------------------------------+ +/// \ | +/// +----------------------------------+ +/// | SourceRegion.BranchTargetBlock | +/// +----------------------------------+ +/// +/// \param[in] SourceRegion The candidate to move blocks from +/// \param[in] TargetRegion The candidate to move blocks to +/// +bool PPCBranchCoalescing::mergeCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) { + + if (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) { + llvm_unreachable("Cannot have both MustMoveDown and MustMoveUp set!"); + return false; + } + + if (!validateCandidates(SourceRegion, TargetRegion)) + return false; + + // Start the merging process by first handling the BranchBlock. + // Move any PHIs in SourceRegion.BranchBlock down to the branch-taken block + moveAndUpdatePHIs(SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock); + + // Move remaining instructions in SourceRegion.BranchBlock into + // TargetRegion.BranchBlock + MachineBasicBlock::iterator firstInstr = + SourceRegion.BranchBlock->getFirstNonPHI(); + MachineBasicBlock::iterator lastInstr = + SourceRegion.BranchBlock->getFirstTerminator(); + + MachineBasicBlock *Source = SourceRegion.MustMoveDown + ? SourceRegion.BranchTargetBlock + : TargetRegion.BranchBlock; + + MachineBasicBlock::iterator Target = + SourceRegion.MustMoveDown + ? SourceRegion.BranchTargetBlock->getFirstNonPHI() + : TargetRegion.BranchBlock->getFirstTerminator(); + + Source->splice(Target, SourceRegion.BranchBlock, firstInstr, lastInstr); + + // Once PHI and instructions have been moved we need to clean up the + // control flow. + + // Remove SourceRegion.FallThroughBlock before transferring successors of + // SourceRegion.BranchBlock to TargetRegion.BranchBlock. + SourceRegion.BranchBlock->removeSuccessor(SourceRegion.FallThroughBlock); + TargetRegion.BranchBlock->transferSuccessorsAndUpdatePHIs( + SourceRegion.BranchBlock); + // Update branch in TargetRegion.BranchBlock to jump to + // SourceRegion.BranchTargetBlock + // In this case, TargetRegion.BranchTargetBlock == SourceRegion.BranchBlock. 
TargetRegion.BranchBlock->ReplaceUsesOfBlockWith( + SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock); + // Remove the branch statement(s) in SourceRegion.BranchBlock + MachineBasicBlock::iterator I = + SourceRegion.BranchBlock->terminators().begin(); + while (I != SourceRegion.BranchBlock->terminators().end()) { + MachineInstr &CurrInst = *I; + ++I; + if (CurrInst.isBranch()) + CurrInst.eraseFromParent(); + } + + // Fall-through block should be empty since this is part of the condition + // to coalesce the branches. + assert(TargetRegion.FallThroughBlock->empty() && + "FallThroughBlocks should be empty!"); + + // Transfer successor information and move PHIs down to the + // branch-taken block. + TargetRegion.FallThroughBlock->transferSuccessorsAndUpdatePHIs( + SourceRegion.FallThroughBlock); + TargetRegion.FallThroughBlock->removeSuccessor(SourceRegion.BranchBlock); + + // Remove the blocks from the function. + assert(SourceRegion.BranchBlock->empty() && + "Expecting branch block to be empty!"); + SourceRegion.BranchBlock->eraseFromParent(); + + assert(SourceRegion.FallThroughBlock->empty() && + "Expecting fall-through block to be empty!\n"); + SourceRegion.FallThroughBlock->eraseFromParent(); + + NumBlocksCoalesced++; + return true; +} + +bool PPCBranchCoalescing::runOnMachineFunction(MachineFunction &MF) { + + if (skipFunction(MF.getFunction()) || MF.empty()) + return false; + + bool didSomething = false; + + DEBUG(dbgs() << "******** Branch Coalescing ********\n"); + initialize(MF); + + DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n"); + + CoalescingCandidateInfo Cand1, Cand2; + // Walk over blocks and find candidates to merge + // Continue trying to merge with the first candidate found, as long as merging + // is successful.
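+  // Illustration (block numbers hypothetical, mirroring the file header
+  // example): with %bb.0 -> {%bb.1, %bb.2} and %bb.2 -> {%bb.3, %bb.4},
+  // Cand1 is built around %bb.0 (branch target %bb.2) and Cand2 around
+  // %bb.2 (branch target %bb.4); if both branches test the same condition,
+  // mergeCandidates(Cand2, Cand1) folds %bb.2 away and the do/while loop
+  // retries from the same MBB.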
+ for (MachineBasicBlock &MBB : MF) { + bool MergedCandidates = false; + do { + MergedCandidates = false; + Cand1.clear(); + Cand2.clear(); + + Cand1.BranchBlock = &MBB; + + // If unable to coalesce the branch, then continue to next block + if (!canCoalesceBranch(Cand1)) + break; + + Cand2.BranchBlock = Cand1.BranchTargetBlock; + if (!canCoalesceBranch(Cand2)) + break; + + // Sanity check + // The branch-taken block of the second candidate should post-dominate the + // first candidate + assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) && + "Branch-taken block should post-dominate first candidate"); + + if (!identicalOperands(Cand1.Cond, Cand2.Cond)) { + DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and " + << Cand2.BranchBlock->getNumber() + << " have different branches\n"); + break; + } + if (!canMerge(Cand2, Cand1)) { + DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber() + << " and " << Cand2.BranchBlock->getNumber() << "\n"); + NumBlocksNotCoalesced++; + continue; + } + DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber() + << " and " << Cand1.BranchTargetBlock->getNumber() << "\n"); + MergedCandidates = mergeCandidates(Cand2, Cand1); + if (MergedCandidates) + didSomething = true; + + DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n"); + } while (MergedCandidates); + } + +#ifndef NDEBUG + // Verify MF is still valid after branch coalescing + if (didSomething) + MF.verify(nullptr, "Error in code produced by branch coalescing"); +#endif // NDEBUG + + DEBUG(dbgs() << "Finished Branch Coalescing\n"); + return didSomething; +} diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index d0b66f9bca09a..64b8f1168beb8 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -23,9 +23,9 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "ppc-branch-select" diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp index 53f33ac1fc0ed..fc638829378ab 100644 --- a/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -26,12 +26,17 @@ #include "PPC.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" +#include "PPCTargetTransformInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" @@ -64,6 +69,13 @@ using namespace llvm; static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1)); #endif +// The latency of mtctr is only justified if there are more than 4 +// comparisons that will be removed as a result.
+static cl::opt<unsigned> +SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden, + cl::desc("Loops with a constant trip count smaller than " + "this value will not use the count register.")); + STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops"); namespace llvm { @@ -95,6 +107,8 @@ namespace { AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } private: @@ -107,10 +121,12 @@ namespace { const PPCTargetLowering *TLI; const DataLayout *DL; const TargetLibraryInfo *LibInfo; + const TargetTransformInfo *TTI; LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; bool PreserveLCSSA; + TargetSchedModel SchedModel; }; char PPCCTRLoops::ID = 0; @@ -179,6 +195,7 @@ bool PPCCTRLoops::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DL = &F.getParent()->getDataLayout(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); LibInfo = TLIP ? &TLIP->getTLI() : nullptr; @@ -243,8 +260,8 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { if (CallInst *CI = dyn_cast<CallInst>(J)) { // Inline ASM is okay, unless it clobbers the ctr register. if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) { - if (asmClobbersCTR(IA)) - return true; + if (asmClobbersCTR(IA)) + return true; continue; } @@ -462,10 +479,24 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { return false; } - bool PPCCTRLoops::convertToCTRLoop(Loop *L) { bool MadeChange = false; + // Do not convert small short loops to a CTR loop. + unsigned ConstTripCount = SE->getSmallConstantTripCount(L); + if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) { + SmallPtrSet<const Value *, 4> EphValues; + auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *L->getHeader()->getParent()); + CodeMetrics::collectEphemeralValues(L, &AC, EphValues); + CodeMetrics Metrics; + for (BasicBlock *BB : L->blocks()) + Metrics.analyzeBasicBlock(BB, *TTI, EphValues); + // 6 is an approximate latency for the mtctr instruction; for example, + // with an issue width of 4, loops of up to 24 instructions keep their + // normal branches. + if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth())) + return false; + } + // Process nested loops first.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { MadeChange |= convertToCTRLoop(*I); @@ -659,12 +690,11 @@ check_block: } if (I != BI && clobbersCTR(*I)) { - DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" << - MBB->getFullName() << ") instruction " << *I << - " clobbers CTR, invalidating " << "BB#" << - BI->getParent()->getNumber() << " (" << - BI->getParent()->getFullName() << ") instruction " << - *BI << "\n"); + DEBUG(dbgs() << printMBBReference(*MBB) << " (" << MBB->getFullName() + << ") instruction " << *I << " clobbers CTR, invalidating " + << printMBBReference(*BI->getParent()) << " (" + << BI->getParent()->getFullName() << ") instruction " << *BI + << "\n"); return false; } @@ -678,10 +708,10 @@ check_block: if (CheckPreds) { queue_preds: if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) { - DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" << - BI->getParent()->getNumber() << " (" << - BI->getParent()->getFullName() << ") instruction " << - *BI << "\n"); + DEBUG(dbgs() << "Unable to find a MTCTR instruction for " + << printMBBReference(*BI->getParent()) << " (" + << BI->getParent()->getFullName() << ") instruction " << *BI + << "\n"); return false; } diff --git a/lib/Target/PowerPC/PPCEarlyReturn.cpp b/lib/Target/PowerPC/PPCEarlyReturn.cpp index 811e4dd9dfe16..1699463c0a4bc 100644 --- a/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -173,7 +173,7 @@ protected: public: bool runOnMachineFunction(MachineFunction &MF) override { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; TII = MF.getSubtarget().getInstrInfo(); diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp index 41e3190c3eec7..b00e98b63e346 100644 --- a/lib/Target/PowerPC/PPCExpandISEL.cpp +++ b/lib/Target/PowerPC/PPCExpandISEL.cpp @@ -59,6 +59,8 @@ class PPCExpandISEL : public MachineFunctionPass { typedef SmallDenseMap<int, BlockISELList> ISELInstructionList; // A map of MBB numbers to their lists of contained ISEL instructions. + // Please note when we traverse this list and expand ISEL, we only remove + // the ISEL from the MBB, not from this list. ISELInstructionList ISELInstructions; /// Initialize the object. @@ -124,9 +126,6 @@ public: #endif bool runOnMachineFunction(MachineFunction &MF) override { - if (!isExpandISELEnabled(MF)) - return false; - DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n"); initialize(MF); @@ -171,7 +170,7 @@ bool PPCExpandISEL::collectISELInstructions() { #ifndef NDEBUG void PPCExpandISEL::DumpISELInstructions() const { for (const auto &I : ISELInstructions) { - DEBUG(dbgs() << "BB#" << I.first << ":\n"); + DEBUG(dbgs() << printMBBReference(*MF->getBlockNumbered(I.first)) << ":\n"); for (const auto &VI : I.second) DEBUG(dbgs() << " "; VI->print(dbgs())); } @@ -190,26 +189,71 @@ bool PPCExpandISEL::canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI) { } void PPCExpandISEL::expandAndMergeISELs() { + bool ExpandISELEnabled = isExpandISELEnabled(*MF); + for (auto &BlockList : ISELInstructions) { - DEBUG(dbgs() << "Expanding ISEL instructions in BB#" << BlockList.first + DEBUG(dbgs() << "Expanding ISEL instructions in " + << printMBBReference(*MF->getBlockNumbered(BlockList.first)) + << "\n"); - BlockISELList &CurrentISELList = BlockList.second; auto I = CurrentISELList.begin(); auto E = CurrentISELList.end(); while (I != E) { - BlockISELList SubISELList; - - SubISELList.push_back(*I++); - - // Collect the ISELs that can be merged together.
- while (I != E && canMerge(SubISELList.back(), *I)) + assert(isISEL(**I) && "Expecting an ISEL instruction"); + MachineOperand &Dest = (*I)->getOperand(0); + MachineOperand &TrueValue = (*I)->getOperand(1); + MachineOperand &FalseValue = (*I)->getOperand(2); + + // Special case 1, all registers used by ISEL are the same one. + // The non-redundant isel 0, 0, 0, N would not satisfy these conditions + // as it would be ISEL %R0, %ZERO, %R0, %CRN. + if (useSameRegister(Dest, TrueValue) && + useSameRegister(Dest, FalseValue)) { + DEBUG(dbgs() << "Remove redundant ISEL instruction: " << **I << "\n"); + // FIXME: if the CR field used has no other uses, we could eliminate the + // instruction that defines it. This would have to be done manually + // since this pass runs too late to run DCE after it. + NumRemoved++; + (*I)->eraseFromParent(); + I++; + } else if (useSameRegister(TrueValue, FalseValue)) { + // Special case 2, the two input registers used by ISEL are the same. + // Note: the non-foldable isel RX, 0, 0, N would not satisfy this + // condition as it would be ISEL %RX, %ZERO, %R0, %CRN, which makes it + // safe to fold ISEL to MR(OR) instead of ADDI. + MachineBasicBlock *MBB = (*I)->getParent(); + DEBUG(dbgs() << "Fold the ISEL instruction to an unconditional copy:\n"); + DEBUG(dbgs() << "ISEL: " << **I << "\n"); + NumFolded++; + // Note: we're using both the TrueValue and FalseValue operands so as + // not to lose the kill flag if it is set on either of them. + BuildMI(*MBB, (*I), dl, TII->get(isISEL8(**I) ? PPC::OR8 : PPC::OR)) + .add(Dest) + .add(TrueValue) + .add(FalseValue); + (*I)->eraseFromParent(); + I++; + } else if (ExpandISELEnabled) { // Normal cases expansion enabled + DEBUG(dbgs() << "Expand ISEL instructions:\n"); + DEBUG(dbgs() << "ISEL: " << **I << "\n"); + BlockISELList SubISELList; SubISELList.push_back(*I++); - + // Collect the ISELs that can be merged together. + // This will eat up ISEL instructions without considering whether they + // may be redundant or foldable to a register copy. So we still keep + // the handleSpecialCases() downstream to handle them. + while (I != E && canMerge(SubISELList.back(), *I)) { + DEBUG(dbgs() << "ISEL: " << **I << "\n"); + SubISELList.push_back(*I++); + } + + expandMergeableISELs(SubISELList); + } else { // Normal cases expansion disabled + I++; // leave the ISEL as it is + } + } // end while + } // end for } void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL, @@ -232,13 +276,15 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL, // Similarly, if at least one of the ISEL instructions satisfies the // following condition, we need the False Block: // The Dest Register and False Value Register are not the same. - bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue); bool IsORIInstRequired = !useSameRegister(Dest, FalseValue); // Special case 1, all registers used by ISEL are the same one. if (!IsADDIInstRequired && !IsORIInstRequired) { DEBUG(dbgs() << "Remove redundant ISEL instruction."); + // FIXME: if the CR field used has no other uses, we could eliminate the + // instruction that defines it. This would have to be done manually + // since this pass runs too late to run DCE after it. NumRemoved++; (*MI)->eraseFromParent(); // Setting MI to the erase result keeps the iterator valid and increased. @@ -253,14 +299,15 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL, // PPC::ZERO8 will be used for the first operand if the value is meant to // be zero.
In this case, the useSameRegister method will return false, + // thereby preventing this ISEL from being folded. - if (useSameRegister(TrueValue, FalseValue) && (BIL.size() == 1)) { DEBUG(dbgs() << "Fold the ISEL instruction to an unconditional copy."); NumFolded++; - BuildMI(*MBB, (*MI), dl, TII->get(isISEL8(**MI) ? PPC::ADDI8 : PPC::ADDI)) + // Note: we're using both the TrueValue and FalseValue operands so as + // not to lose the kill flag if it is set on either of them. + BuildMI(*MBB, (*MI), dl, TII->get(isISEL8(**MI) ? PPC::OR8 : PPC::OR)) .add(Dest) .add(TrueValue) - .add(MachineOperand::CreateImm(0)); + .add(FalseValue); (*MI)->eraseFromParent(); // Setting MI to the erase result keeps the iterator valid and increased. MI = BIL.erase(MI); diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index bc9957194f6dd..402e29cdff726 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -29,6 +29,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" @@ -36,7 +37,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" //===----------------------------------------------------------------------===// @@ -1930,7 +1930,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { PPCFuncInfo->setUsesTOCBasePtr(); // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)). - if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) { + if (CModel == CodeModel::Small) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT), TmpReg) .addConstantPoolIndex(Idx).addReg(PPC::X2); @@ -1981,7 +1981,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { PPCFuncInfo->setUsesTOCBasePtr(); // For small code model, generate a simple TOC load. - if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) + if (CModel == CodeModel::Small) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc), DestReg) .addGlobalAddress(GV) @@ -1991,9 +1991,9 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { // or externally available linkage, a non-local function address, or a // jump table address (not yet needed), or if we are generating code // for large code model, we generate: - // LDtocL(GV, ADDIStocHA(%X2, GV)) + // LDtocL(GV, ADDIStocHA(%x2, GV)) // Otherwise we generate: - // ADDItocL(ADDIStocHA(%X2, GV), GV) + // ADDItocL(ADDIStocHA(%x2, GV), GV) // Either way, start with the ADDIStocHA: unsigned HighPartReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index b49c3345a17dd..c870a2256691e 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -312,11 +312,9 @@ static void HandleVRSaveUpdate(MachineInstr &MI, const TargetInstrInfo &TII) { // Live in and live out values already must be in the mask, so don't bother // marking them.
- for (MachineRegisterInfo::livein_iterator - I = MF->getRegInfo().livein_begin(), - E = MF->getRegInfo().livein_end(); I != E; ++I) { - unsigned RegNo = TRI->getEncodingValue(I->first); - if (VRRegNo[RegNo] == I->first) // If this really is a vector reg. + for (std::pair<unsigned, unsigned> LI : MF->getRegInfo().liveins()) { + unsigned RegNo = TRI->getEncodingValue(LI.first); + if (VRRegNo[RegNo] == LI.first) // If this really is a vector reg. UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked. } @@ -436,7 +434,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned LR = RegInfo->getRARegister(); - bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); + bool DisableRedZone = MF.getFunction().hasFnAttribute(Attribute::NoRedZone); bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. !MustSaveLR(MF, LR) && // No need to save LR. @@ -501,7 +499,7 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const { // Naked functions have no stack frame pushed, so we don't have a frame // pointer. - if (MF.getFunction()->hasFnAttribute(Attribute::Naked)) + if (MF.getFunction().hasFnAttribute(Attribute::Naked)) return false; return MF.getTarget().Options.DisableFramePointerElim(MF) || @@ -694,7 +692,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); DebugLoc dl; bool needsCFI = MMI.hasDebugInfo() || - MF.getFunction()->needsUnwindTableEntry(); + MF.getFunction().needsUnwindTableEntry(); // Get processor type. bool isPPC64 = Subtarget.isPPC64(); @@ -1507,7 +1505,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, unsigned RetOpcode = MBBI->getOpcode(); if (MF.getTarget().Options.GuaranteedTailCallOpt && (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) && - MF.getFunction().getCallingConv() == CallingConv::Fast) { + MF.getFunction().getCallingConv() == CallingConv::Fast) { PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); unsigned CallerAllocatedAmt = FI->getMinReservedArea(); @@ -2067,7 +2065,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, bool PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { // Currently, this function only handles SVR4 32- and 64-bit ABIs.
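One recurring change across these hunks deserves a standalone note: MachineFunction::getFunction() now returns a reference rather than a pointer, so every call site switches from `->` to `.`. A minimal before/after sketch (the attribute query is the one from determineFrameLayout above; surrounding code elided):

    // Before this import, getFunction() returned const Function *:
    //   bool DisableRedZone =
    //       MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
    // After, it returns const Function &, so member access uses '.':
    bool DisableRedZone = MF.getFunction().hasFnAttribute(Attribute::NoRedZone);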
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index 28b0c57f0ffb5..f845d5a9ac64a 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -15,7 +15,7 @@ #include "PPC.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -106,7 +106,7 @@ public: bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const override; /// targetHandlesStackFrameRounding - Returns true if the target is diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 901539b682baa..d3a223fe03e0f 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -36,6 +36,8 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DebugLoc.h" @@ -53,8 +55,6 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -101,6 +101,29 @@ static cl::opt<bool> EnableBranchHint( cl::desc("Enable static hinting of branches on ppc"), cl::Hidden); +enum ICmpInGPRType { ICGPR_All, ICGPR_None, ICGPR_I32, ICGPR_I64, + ICGPR_NonExtIn, ICGPR_Zext, ICGPR_Sext, ICGPR_ZextI32, + ICGPR_SextI32, ICGPR_ZextI64, ICGPR_SextI64 }; + +static cl::opt<ICmpInGPRType> CmpInGPR( + "ppc-gpr-icmps", cl::Hidden, cl::init(ICGPR_All), + cl::desc("Specify the types of comparisons to emit GPR-only code for."), + cl::values(clEnumValN(ICGPR_None, "none", "Do not modify integer comparisons."), + clEnumValN(ICGPR_All, "all", "All possible int comparisons in GPRs."), + clEnumValN(ICGPR_I32, "i32", "Only i32 comparisons in GPRs."), + clEnumValN(ICGPR_I64, "i64", "Only i64 comparisons in GPRs."), + clEnumValN(ICGPR_NonExtIn, "nonextin", + "Only comparisons where inputs don't need [sz]ext."), + clEnumValN(ICGPR_Zext, "zext", "Only comparisons with zext result."), + clEnumValN(ICGPR_ZextI32, "zexti32", + "Only i32 comparisons with zext result."), + clEnumValN(ICGPR_ZextI64, "zexti64", + "Only i64 comparisons with zext result."), + clEnumValN(ICGPR_Sext, "sext", "Only comparisons with sext result."), + clEnumValN(ICGPR_SextI32, "sexti32", + "Only i32 comparisons with sext result."), + clEnumValN(ICGPR_SextI64, "sexti64", + "Only i64 comparisons with sext result."))); namespace { //===--------------------------------------------------------------------===// @@ -133,6 +156,12 @@ namespace { void PreprocessISelDAG() override; void PostprocessISelDAG() override; + /// getI16Imm - Return a target constant with the specified value, of type + /// i16. + inline SDValue getI16Imm(unsigned Imm, const SDLoc &dl) { + return CurDAG->getTargetConstant(Imm, dl, MVT::i16); + } + /// getI32Imm - Return a target constant with the specified value, of type /// i32.
inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) { @@ -168,6 +197,7 @@ namespace { bool tryBitfieldInsert(SDNode *N); bool tryBitPermutation(SDNode *N); + bool tryIntCompareInGPR(SDNode *N); /// SelectCC - Select a comparison of the specified values with the /// specified condition code, returning the CR# of the expression. @@ -270,34 +300,7 @@ namespace { #include "PPCGenDAGISel.inc" private: - // Conversion type for interpreting results of a 32-bit instruction as - // a 64-bit value or vice versa. - enum ExtOrTruncConversion { Ext, Trunc }; - - // Modifiers to guide how an ISD::SETCC node's result is to be computed - // in a GPR. - // ZExtOrig - use the original condition code, zero-extend value - // ZExtInvert - invert the condition code, zero-extend value - // SExtOrig - use the original condition code, sign-extend value - // SExtInvert - invert the condition code, sign-extend value - enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert }; - bool trySETCC(SDNode *N); - bool tryEXTEND(SDNode *N); - bool tryLogicOpOfCompares(SDNode *N); - SDValue computeLogicOpInGPR(SDValue LogicOp); - SDValue signExtendInputIfNeeded(SDValue Input); - SDValue zeroExtendInputIfNeeded(SDValue Input); - SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv); - SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, - int64_t RHSValue, SDLoc dl); - SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, - int64_t RHSValue, SDLoc dl); - SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, - int64_t RHSValue, SDLoc dl); - SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, - int64_t RHSValue, SDLoc dl); - SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts); void PeepholePPC64(); void PeepholePPC64ZExt(); @@ -388,7 +391,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { // Insert the set of GlobalBaseReg into the first MBB of the function MachineBasicBlock &FirstMBB = MF->front(); MachineBasicBlock::iterator MBBI = FirstMBB.begin(); - const Module *M = MF->getFunction()->getParent(); + const Module *M = MF->getFunction().getParent(); DebugLoc dl; if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) { @@ -450,6 +453,12 @@ static bool isInt32Immediate(SDValue N, unsigned &Imm) { return isInt32Immediate(N.getNode(), Imm); } +/// isInt64Immediate - This method tests to see if the value is a 64-bit +/// constant operand. If so, Imm will receive the 64-bit value. +static bool isInt64Immediate(SDValue N, uint64_t &Imm) { + return isInt64Immediate(N.getNode(), Imm); +} + static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, const SDValue &DestMBB) { assert(isa<BasicBlockSDNode>(DestMBB)); @@ -607,8 +616,6 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) { unsigned MB, ME; if (isRunOfOnes(InsertMask, MB, ME)) { - SDValue Tmp1, Tmp2; - if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) && isInt32Immediate(Op1.getOperand(1), Value)) { Op1 = Op1.getOperand(0); @@ -643,8 +650,8 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) { } // Predict the number of instructions that would be generated by calling -// getInt64(N). -static unsigned getInt64CountDirect(int64_t Imm) { +// selectI64Imm(N). +static unsigned selectI64ImmInstrCountDirect(int64_t Imm) { // Assume no remaining bits. unsigned Remainder = 0; // Assume no shift required.
@@ -712,8 +719,8 @@ static uint64_t Rot64(uint64_t Imm, unsigned R) { return (Imm << R) | (Imm >> (64 - R)); } -static unsigned getInt64Count(int64_t Imm) { - unsigned Count = getInt64CountDirect(Imm); +static unsigned selectI64ImmInstrCount(int64_t Imm) { + unsigned Count = selectI64ImmInstrCountDirect(Imm); // If the instruction count is 1 or 2, we do not need further analysis // since rotate + load constant requires at least 2 instructions. @@ -722,10 +729,10 @@ static unsigned getInt64Count(int64_t Imm) { for (unsigned r = 1; r < 63; ++r) { uint64_t RImm = Rot64(Imm, r); - unsigned RCount = getInt64CountDirect(RImm) + 1; + unsigned RCount = selectI64ImmInstrCountDirect(RImm) + 1; Count = std::min(Count, RCount); - // See comments in getInt64 for an explanation of the logic below. + // See comments in selectI64Imm for an explanation of the logic below. unsigned LS = findLastSet(RImm); if (LS != r-1) continue; @@ -733,17 +740,17 @@ static unsigned getInt64Count(int64_t Imm) { uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); uint64_t RImmWithOnes = RImm | OnesMask; - RCount = getInt64CountDirect(RImmWithOnes) + 1; + RCount = selectI64ImmInstrCountDirect(RImmWithOnes) + 1; Count = std::min(Count, RCount); } return Count; } -// Select a 64-bit constant. For cost-modeling purposes, getInt64Count +// Select a 64-bit constant. For cost-modeling purposes, selectI64ImmInstrCount // (above) needs to be kept in sync with this function. -static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl, - int64_t Imm) { +static SDNode *selectI64ImmDirect(SelectionDAG *CurDAG, const SDLoc &dl, + int64_t Imm) { // Assume no remaining bits. unsigned Remainder = 0; // Assume no shift required. @@ -779,8 +786,10 @@ static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl, // Simple value. if (isInt<16>(Imm)) { + uint64_t SextImm = SignExtend64(Lo, 16); + SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64); // Just the Lo bits. - Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, SDImm); } else if (Lo) { // Handle the Hi bits. unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8; @@ -825,13 +834,14 @@ static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl, return Result; } -static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) { - unsigned Count = getInt64CountDirect(Imm); +static SDNode *selectI64Imm(SelectionDAG *CurDAG, const SDLoc &dl, + int64_t Imm) { + unsigned Count = selectI64ImmInstrCountDirect(Imm); // If the instruction count is 1 or 2, we do not need further analysis // since rotate + load constant requires at least 2 instructions. 
if (Count <= 2) - return getInt64Direct(CurDAG, dl, Imm); + return selectI64ImmDirect(CurDAG, dl, Imm); unsigned RMin = 0; @@ -840,7 +850,7 @@ static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) { for (unsigned r = 1; r < 63; ++r) { uint64_t RImm = Rot64(Imm, r); - unsigned RCount = getInt64CountDirect(RImm) + 1; + unsigned RCount = selectI64ImmInstrCountDirect(RImm) + 1; if (RCount < Count) { Count = RCount; RMin = r; @@ -863,7 +873,7 @@ uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); uint64_t RImmWithOnes = RImm | OnesMask; - RCount = getInt64CountDirect(RImmWithOnes) + 1; + RCount = selectI64ImmInstrCountDirect(RImmWithOnes) + 1; if (RCount < Count) { Count = RCount; RMin = r; @@ -873,24 +883,86 @@ } if (!RMin) - return getInt64Direct(CurDAG, dl, Imm); + return selectI64ImmDirect(CurDAG, dl, Imm); auto getI32Imm = [CurDAG, dl](unsigned Imm) { return CurDAG->getTargetConstant(Imm, dl, MVT::i32); }; - SDValue Val = SDValue(getInt64Direct(CurDAG, dl, MatImm), 0); + SDValue Val = SDValue(selectI64ImmDirect(CurDAG, dl, MatImm), 0); return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val, getI32Imm(64 - RMin), getI32Imm(MaskEnd)); } +static unsigned allUsesTruncate(SelectionDAG *CurDAG, SDNode *N) { + unsigned MaxTruncation = 0; + // Cannot use range-based for loop here as we need the actual use (i.e., we + // need the operand number corresponding to the use). A range-based for + // will unbox the use and provide an SDNode*. + for (SDNode::use_iterator Use = N->use_begin(), UseEnd = N->use_end(); + Use != UseEnd; ++Use) { + unsigned Opc = + Use->isMachineOpcode() ? Use->getMachineOpcode() : Use->getOpcode(); + switch (Opc) { + default: return 0; + case ISD::TRUNCATE: + if (Use->isMachineOpcode()) + return 0; + MaxTruncation = + std::max(MaxTruncation, Use->getValueType(0).getSizeInBits()); + continue; + case ISD::STORE: { + if (Use->isMachineOpcode()) + return 0; + StoreSDNode *STN = cast<StoreSDNode>(*Use); + unsigned MemVTSize = STN->getMemoryVT().getSizeInBits(); + if (MemVTSize == 64 || Use.getOperandNo() != 0) + return 0; + MaxTruncation = std::max(MaxTruncation, MemVTSize); + continue; + } + case PPC::STW8: + case PPC::STWX8: + case PPC::STWU8: + case PPC::STWUX8: + if (Use.getOperandNo() != 0) + return 0; + MaxTruncation = std::max(MaxTruncation, 32u); + continue; + case PPC::STH8: + case PPC::STHX8: + case PPC::STHU8: + case PPC::STHUX8: + if (Use.getOperandNo() != 0) + return 0; + MaxTruncation = std::max(MaxTruncation, 16u); + continue; + case PPC::STB8: + case PPC::STBX8: + case PPC::STBU8: + case PPC::STBUX8: + if (Use.getOperandNo() != 0) + return 0; + MaxTruncation = std::max(MaxTruncation, 8u); + continue; + } + } + return MaxTruncation; +} + // Select a 64-bit constant. -static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) { +static SDNode *selectI64Imm(SelectionDAG *CurDAG, SDNode *N) { SDLoc dl(N); // Get 64 bit value.
int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue(); - return getInt64(CurDAG, dl, Imm); + if (unsigned MinSize = allUsesTruncate(CurDAG, N)) { + uint64_t SextImm = SignExtend64(Imm, MinSize); + SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64); + if (isInt<16>(SextImm)) + return CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, SDImm); + } + return selectI64Imm(CurDAG, dl, Imm); } namespace { @@ -1090,6 +1162,25 @@ class BitPermutationSelector { return std::make_pair(Interesting = true, &Bits); } + case ISD::ZERO_EXTEND: { + // We support only the case with zero extension from i32 to i64 so far. + if (V.getValueType() != MVT::i64 || + V.getOperand(0).getValueType() != MVT::i32) + break; + + const SmallVector<ValueBit, 64> *LHSBits; + const unsigned NumOperandBits = 32; + std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0), + NumOperandBits); + + for (unsigned i = 0; i < NumOperandBits; ++i) + Bits[i] = (*LHSBits)[i]; + + for (unsigned i = NumOperandBits; i < NumBits; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + + return std::make_pair(Interesting, &Bits); + } } for (unsigned i = 0; i < NumBits; ++i) @@ -1351,6 +1442,24 @@ class BitPermutationSelector { return ~Mask; } + // This method extends an input value to 64 bit if input is 32-bit integer. + // While selecting instructions in BitPermutationSelector in 64-bit mode, + // an input value can be a 32-bit integer if a ZERO_EXTEND node is included. + // In such case, we extend it to 64 bit to be consistent with other values. + SDValue ExtendToInt64(SDValue V, const SDLoc &dl) { + if (V.getValueSizeInBits() == 64) + return V; + + assert(V.getValueSizeInBits() == 32); + SDValue SubRegIdx = CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); + SDValue ImDef = SDValue(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, + MVT::i64), 0); + SDValue ExtVal = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, + MVT::i64, ImDef, V, + SubRegIdx), 0); + return ExtVal; + } + // Depending on the number of groups for a particular value, it might be // better to rotate, mask explicitly (using andi/andis), and then or the // result. Select this part of the result first.
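For the new ISD::ZERO_EXTEND case above, the bit bookkeeping is simple: the low 32 result bits are forwarded operand bits and the high 32 are ValueBit::ConstZero. A toy model of that provenance table (std::pair stands in for ValueBit; purely illustrative):

#include <array>
#include <cassert>
#include <utility>

int main() {
  // {IsConstZero, SourceBit}: a stand-in for one ValueBit entry.
  std::array<std::pair<bool, unsigned>, 64> Bits;
  for (unsigned i = 0; i < 32; ++i)
    Bits[i] = {false, i};   // forwarded from bit i of the i32 operand
  for (unsigned i = 32; i < 64; ++i)
    Bits[i] = {true, 0};    // ValueBit::ConstZero
  assert(!Bits[5].first && Bits[5].second == 5);
  assert(Bits[40].first); // rotates/masks of these stay analyzable
}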
@@ -1567,27 +1676,30 @@ class BitPermutationSelector { assert(InstMaskStart >= 32 && "Mask cannot start out of range"); assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); SDValue Ops[] = - { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl), - getI32Imm(InstMaskEnd - 32, dl) }; + { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskStart - 32, dl), getI32Imm(InstMaskEnd - 32, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64, Ops), 0); } if (InstMaskEnd == 63) { SDValue Ops[] = - { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; + { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskStart, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0); } if (InstMaskStart == 0) { SDValue Ops[] = - { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskEnd, dl) }; + { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskEnd, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0); } if (InstMaskEnd == 63 - RLAmt) { SDValue Ops[] = - { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; + { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskStart, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0); } @@ -1628,15 +1740,16 @@ class BitPermutationSelector { assert(InstMaskStart >= 32 && "Mask cannot start out of range"); assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); SDValue Ops[] = - { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl), - getI32Imm(InstMaskEnd - 32, dl) }; + { ExtendToInt64(Base, dl), ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskStart - 32, dl), getI32Imm(InstMaskEnd - 32, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64, Ops), 0); } if (InstMaskEnd == 63 - RLAmt) { SDValue Ops[] = - { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; + { ExtendToInt64(Base, dl), ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskStart, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0); } @@ -1730,7 +1843,7 @@ class BitPermutationSelector { NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) + (unsigned) (ANDIMask != 0 && ANDISMask != 0); else - NumAndInsts += getInt64Count(Mask) + /* and */ 1; + NumAndInsts += selectI64ImmInstrCount(Mask) + /* and */ 1; unsigned NumRLInsts = 0; bool FirstBG = true; @@ -1786,10 +1899,14 @@ class BitPermutationSelector { SDValue ANDIVal, ANDISVal; if (ANDIMask != 0) ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, - VRot, getI32Imm(ANDIMask, dl)), 0); + ExtendToInt64(VRot, dl), + getI32Imm(ANDIMask, dl)), + 0); if (ANDISMask != 0) ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, - VRot, getI32Imm(ANDISMask, dl)), 0); + ExtendToInt64(VRot, dl), + getI32Imm(ANDISMask, dl)), + 0); if (!ANDIVal) TotalVal = ANDISVal; @@ -1797,19 +1914,21 @@ class BitPermutationSelector { TotalVal = ANDIVal; else TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, - ANDIVal, ANDISVal), 0); + ExtendToInt64(ANDIVal, dl), ANDISVal), 0); } else { - TotalVal = SDValue(getInt64(CurDAG, dl, Mask), 0); + TotalVal = SDValue(selectI64Imm(CurDAG, dl, Mask), 0); TotalVal = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, - VRot, TotalVal), 0); + ExtendToInt64(VRot, dl), TotalVal), + 0); } if (!Res) Res = TotalVal; else Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, - Res, TotalVal), 0); + ExtendToInt64(Res, dl), 
TotalVal), + 0); // Now, remove all groups with this underlying value and rotation // factor. @@ -1929,10 +2048,10 @@ class BitPermutationSelector { SDValue ANDIVal, ANDISVal; if (ANDIMask != 0) ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, - Res, getI32Imm(ANDIMask, dl)), 0); + ExtendToInt64(Res, dl), getI32Imm(ANDIMask, dl)), 0); if (ANDISMask != 0) ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, - Res, getI32Imm(ANDISMask, dl)), 0); + ExtendToInt64(Res, dl), getI32Imm(ANDISMask, dl)), 0); if (!ANDIVal) Res = ANDISVal; @@ -1940,14 +2059,14 @@ class BitPermutationSelector { Res = ANDIVal; else Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, - ANDIVal, ANDISVal), 0); + ExtendToInt64(ANDIVal, dl), ANDISVal), 0); } else { - if (InstCnt) *InstCnt += getInt64Count(Mask) + /* and */ 1; + if (InstCnt) *InstCnt += selectI64ImmInstrCount(Mask) + /* and */ 1; - SDValue MaskVal = SDValue(getInt64(CurDAG, dl, Mask), 0); + SDValue MaskVal = SDValue(selectI64Imm(CurDAG, dl, Mask), 0); Res = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, - Res, MaskVal), 0); + ExtendToInt64(Res, dl), MaskVal), 0); } } @@ -2046,962 +2165,1658 @@ public: } }; -} // end anonymous namespace - -bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) { - if (N->getValueType(0) != MVT::i32 && - N->getValueType(0) != MVT::i64) - return false; - - if (!UseBitPermRewriter) - return false; +class IntegerCompareEliminator { + SelectionDAG *CurDAG; + PPCDAGToDAGISel *S; + // Conversion type for interpreting results of a 32-bit instruction as + // a 64-bit value or vice versa. + enum ExtOrTruncConversion { Ext, Trunc }; + + // Modifiers to guide how an ISD::SETCC node's result is to be computed + // in a GPR. + // ZExtOrig - use the original condition code, zero-extend value + // ZExtInvert - invert the condition code, zero-extend value + // SExtOrig - use the original condition code, sign-extend value + // SExtInvert - invert the condition code, sign-extend value + enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert }; + + // Comparisons against zero to emit GPR code sequences for. Each of these + // sequences may need to be emitted for two or more equivalent patterns. + // For example (a >= 0) == (a > -1). The direction of the comparison () + // matters as well as the extension type: sext (-1/0), zext (1/0). 
+ // GEZExt - (zext (LHS >= 0)) + // GESExt - (sext (LHS >= 0)) + // LEZExt - (zext (LHS <= 0)) + // LESExt - (sext (LHS <= 0)) + enum ZeroCompare { GEZExt, GESExt, LEZExt, LESExt }; + + SDNode *tryEXTEND(SDNode *N); + SDNode *tryLogicOpOfCompares(SDNode *N); + SDValue computeLogicOpInGPR(SDValue LogicOp); + SDValue signExtendInputIfNeeded(SDValue Input); + SDValue zeroExtendInputIfNeeded(SDValue Input); + SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv); + SDValue getCompoundZeroComparisonInGPR(SDValue LHS, SDLoc dl, + ZeroCompare CmpTy); + SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts); - switch (N->getOpcode()) { - default: break; - case ISD::ROTL: - case ISD::SHL: - case ISD::SRL: - case ISD::AND: - case ISD::OR: { - BitPermutationSelector BPS(CurDAG); - if (SDNode *New = BPS.Select(N)) { - ReplaceNode(N, New); - return true; - } - return false; +public: + IntegerCompareEliminator(SelectionDAG *DAG, + PPCDAGToDAGISel *Sel) : CurDAG(DAG), S(Sel) { + assert(CurDAG->getTargetLoweringInfo() + .getPointerTy(CurDAG->getDataLayout()).getSizeInBits() == 64 && + "Only expecting to use this on 64 bit targets."); } + SDNode *Select(SDNode *N) { + if (CmpInGPR == ICGPR_None) + return nullptr; + switch (N->getOpcode()) { + default: break; + case ISD::ZERO_EXTEND: + if (CmpInGPR == ICGPR_Sext || CmpInGPR == ICGPR_SextI32 || + CmpInGPR == ICGPR_SextI64) + return nullptr; + LLVM_FALLTHROUGH; + case ISD::SIGN_EXTEND: + if (CmpInGPR == ICGPR_Zext || CmpInGPR == ICGPR_ZextI32 || + CmpInGPR == ICGPR_ZextI64) + return nullptr; + return tryEXTEND(N); + case ISD::AND: + case ISD::OR: + case ISD::XOR: + return tryLogicOpOfCompares(N); + } + return nullptr; } +}; - return false; +static bool isLogicOp(unsigned Opc) { + return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR; } +// The obvious case for wanting to keep the value in a GPR. Namely, the +// result of the comparison is actually needed in a GPR. +SDNode *IntegerCompareEliminator::tryEXTEND(SDNode *N) { + assert((N->getOpcode() == ISD::ZERO_EXTEND || + N->getOpcode() == ISD::SIGN_EXTEND) && + "Expecting a zero/sign extend node!"); + SDValue WideRes; + // If we are zero-extending the result of a logical operation on i1 + // values, we can keep the values in GPRs. + if (isLogicOp(N->getOperand(0).getOpcode()) && + N->getOperand(0).getValueType() == MVT::i1 && + N->getOpcode() == ISD::ZERO_EXTEND) + WideRes = computeLogicOpInGPR(N->getOperand(0)); + else if (N->getOperand(0).getOpcode() != ISD::SETCC) + return nullptr; + else + WideRes = + getSETCCInGPR(N->getOperand(0), + N->getOpcode() == ISD::SIGN_EXTEND ? + SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); -/// SelectCC - Select a comparison of the specified values with the specified -/// condition code, returning the CR# of the expression. -SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, - const SDLoc &dl) { - // Always select the LHS. 
- unsigned Opc; + if (!WideRes) + return nullptr; - if (LHS.getValueType() == MVT::i32) { - unsigned Imm; - if (CC == ISD::SETEQ || CC == ISD::SETNE) { - if (isInt32Immediate(RHS, Imm)) { - // SETEQ/SETNE comparison with 16-bit immediate, fold it. - if (isUInt<16>(Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, - getI32Imm(Imm & 0xFFFF, dl)), - 0); - // If this is a 16-bit signed immediate, fold it. - if (isInt<16>((int)Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, - getI32Imm(Imm & 0xFFFF, dl)), - 0); + SDLoc dl(N); + bool Input32Bit = WideRes.getValueType() == MVT::i32; + bool Output32Bit = N->getValueType(0) == MVT::i32; - // For non-equality comparisons, the default code would materialize the - // constant, then compare against it, like this: - // lis r2, 4660 - // ori r2, r2, 22136 - // cmpw cr0, r3, r2 - // Since we are just comparing for equality, we can emit this instead: - // xoris r0,r3,0x1234 - // cmplwi cr0,r0,0x5678 - // beq cr0,L6 - SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS, - getI32Imm(Imm >> 16, dl)), 0); - return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor, - getI32Imm(Imm & 0xFFFF, dl)), 0); - } - Opc = PPC::CMPLW; - } else if (ISD::isUnsignedIntSetCC(CC)) { - if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, - getI32Imm(Imm & 0xFFFF, dl)), 0); - Opc = PPC::CMPLW; - } else { - int16_t SImm; - if (isIntS16Immediate(RHS, SImm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, - getI32Imm((int)SImm & 0xFFFF, - dl)), - 0); - Opc = PPC::CMPW; - } - } else if (LHS.getValueType() == MVT::i64) { - uint64_t Imm; - if (CC == ISD::SETEQ || CC == ISD::SETNE) { - if (isInt64Immediate(RHS.getNode(), Imm)) { - // SETEQ/SETNE comparison with 16-bit immediate, fold it. - if (isUInt<16>(Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, - getI32Imm(Imm & 0xFFFF, dl)), - 0); - // If this is a 16-bit signed immediate, fold it. - if (isInt<16>(Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, - getI32Imm(Imm & 0xFFFF, dl)), - 0); + NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0; + NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1; - // For non-equality comparisons, the default code would materialize the - // constant, then compare against it, like this: - // lis r2, 4660 - // ori r2, r2, 22136 - // cmpd cr0, r3, r2 - // Since we are just comparing for equality, we can emit this instead: - // xoris r0,r3,0x1234 - // cmpldi cr0,r0,0x5678 - // beq cr0,L6 - if (isUInt<32>(Imm)) { - SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS, - getI64Imm(Imm >> 16, dl)), 0); - return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor, - getI64Imm(Imm & 0xFFFF, dl)), - 0); - } - } - Opc = PPC::CMPLD; - } else if (ISD::isUnsignedIntSetCC(CC)) { - if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, - getI64Imm(Imm & 0xFFFF, dl)), 0); - Opc = PPC::CMPLD; - } else { - int16_t SImm; - if (isIntS16Immediate(RHS, SImm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, - getI64Imm(SImm & 0xFFFF, dl)), - 0); - Opc = PPC::CMPD; - } - } else if (LHS.getValueType() == MVT::f32) { - Opc = PPC::FCMPUS; + SDValue ConvOp = WideRes; + if (Input32Bit != Output32Bit) + ConvOp = addExtOrTrunc(WideRes, Input32Bit ? 
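The xoris/cmplwi folding in the (relocated) SelectCC body rests on an identity worth spelling out: for a 32-bit constant C split as C = (Hi << 16) | Lo, a == C holds exactly when (a ^ (Hi << 16)) == Lo, so one XORIS plus one CMPLWI replaces materializing C with LIS+ORI. A standalone spot check:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 0x12345678u; // would otherwise need lis+ori+cmpw
  const uint32_t Hi = C >> 16, Lo = C & 0xffffu;
  assert(((C ^ (Hi << 16)) == Lo)); // the equal case folds to equal
  for (uint64_t a = 0; a <= 0xffffffffull; a += 0x10001ull) { // samples
    bool Eq = ((uint32_t)a == C);
    bool Folded = (((uint32_t)a ^ (Hi << 16)) == Lo); // xoris ; cmplwi
    assert(Eq == Folded);
  }
}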
ExtOrTruncConversion::Ext : + ExtOrTruncConversion::Trunc); + return ConvOp.getNode(); +} + +// Attempt to perform logical operations on the results of comparisons while +// keeping the values in GPRs. Without doing so, these would end up being +// lowered to CR-logical operations which suffer from significant latency and +// low ILP. +SDNode *IntegerCompareEliminator::tryLogicOpOfCompares(SDNode *N) { + if (N->getValueType(0) != MVT::i1) + return nullptr; + assert(isLogicOp(N->getOpcode()) && + "Expected a logic operation on setcc results."); + SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0)); + if (!LoweredLogical) + return nullptr; + + SDLoc dl(N); + bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8; + unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt; + SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); + SDValue LHS = LoweredLogical.getOperand(0); + SDValue RHS = LoweredLogical.getOperand(1); + SDValue WideOp; + SDValue OpToConvToRecForm; + + // Look through any 32-bit to 64-bit implicit extend nodes to find the + // opcode that is input to the XORI. + if (IsBitwiseNegate && + LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG) + OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1); + else if (IsBitwiseNegate) + // If the input to the XORI isn't an extension, that's what we're after. + OpToConvToRecForm = LoweredLogical.getOperand(0); + else + // If this is not an XORI, it is a reg-reg logical op and we can convert + // it to record-form. + OpToConvToRecForm = LoweredLogical; + + // Get the record-form version of the node we're looking to use to get the + // CR result from. + uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode(); + int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc); + + // Convert the right node to record-form. This is either the logical we're + // looking at or it is the input node to the negation (if we're looking at + // a bitwise negation). + if (NewOpc != -1 && IsBitwiseNegate) { + // The input to the XORI has a record-form. Use it. + assert(LoweredLogical.getConstantOperandVal(1) == 1 && + "Expected a PPC::XORI8 only for bitwise negation."); + // Emit the record-form instruction. + std::vector Ops; + for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++) + Ops.push_back(OpToConvToRecForm.getOperand(i)); + + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc, dl, + OpToConvToRecForm.getValueType(), + MVT::Glue, Ops), 0); } else { - assert(LHS.getValueType() == MVT::f64 && "Unknown vt!"); - Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD; + assert((NewOpc != -1 || !IsBitwiseNegate) && + "No record form available for AND8/OR8/XOR8?"); + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl, + MVT::i64, MVT::Glue, LHS, RHS), 0); } - return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); + + // Select this node to a single bit from CR0 set by the record-form node + // just created. For bitwise negation, use the EQ bit which is the equivalent + // of negating the result (i.e. it is a bit set when the result of the + // operation is zero). 
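The choice of sub_gt versus sub_eq above can be sanity-checked with a small model of record-form semantics: a dot-form instruction compares its result against zero and sets CR0, so for an i1 value in a GPR the GT bit is the value itself, while the EQ bit of the negation's input is already the negated value. Sketch (the CR0 struct is illustrative, not an LLVM type):

#include <cassert>
#include <cstdint>

struct CR0 { bool LT, GT, EQ; };        // illustrative only
static CR0 recordForm(int64_t Result) { // what a dot-form op sets
  return {Result < 0, Result > 0, Result == 0};
}

int main() {
  for (int64_t x : {0, 1}) {            // an i1 value held in a GPR
    assert(!recordForm(x).LT);
    assert(recordForm(x).GT == (x != 0)); // sub_gt: the value itself
    // Bitwise negation (xori x, 1): rather than execute the xori, take
    // the EQ bit of the xori's *input*, which is already !x.
    assert(recordForm(x).EQ == (x == 0));
    assert((x ^ 1) == (x == 0 ? 1 : 0));
  }
}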
+ SDValue SRIdxVal = + CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32); + SDValue CRBit = + SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, + MVT::i1, CR0Reg, SRIdxVal, + WideOp.getValue(1)), 0); + return CRBit.getNode(); } -static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) { - switch (CC) { - case ISD::SETUEQ: - case ISD::SETONE: - case ISD::SETOLE: - case ISD::SETOGE: - llvm_unreachable("Should be lowered by legalize!"); - default: llvm_unreachable("Unknown condition!"); - case ISD::SETOEQ: - case ISD::SETEQ: return PPC::PRED_EQ; - case ISD::SETUNE: - case ISD::SETNE: return PPC::PRED_NE; - case ISD::SETOLT: - case ISD::SETLT: return PPC::PRED_LT; - case ISD::SETULE: - case ISD::SETLE: return PPC::PRED_LE; - case ISD::SETOGT: - case ISD::SETGT: return PPC::PRED_GT; - case ISD::SETUGE: - case ISD::SETGE: return PPC::PRED_GE; - case ISD::SETO: return PPC::PRED_NU; - case ISD::SETUO: return PPC::PRED_UN; - // These two are invalid for floating point. Assume we have int. - case ISD::SETULT: return PPC::PRED_LT; - case ISD::SETUGT: return PPC::PRED_GT; +// Lower a logical operation on i1 values into a GPR sequence if possible. +// The result can be kept in a GPR if requested. +// Three types of inputs can be handled: +// - SETCC +// - TRUNCATE +// - Logical operation (AND/OR/XOR) +// There is also a special case that is handled (namely a complement operation +// achieved with xor %a, -1). +SDValue IntegerCompareEliminator::computeLogicOpInGPR(SDValue LogicOp) { + assert(isLogicOp(LogicOp.getOpcode()) && + "Can only handle logic operations here."); + assert(LogicOp.getValueType() == MVT::i1 && + "Can only handle logic operations on i1 values here."); + SDLoc dl(LogicOp); + SDValue LHS, RHS; + + // Special case: xor %a, -1 + bool IsBitwiseNegation = isBitwiseNot(LogicOp); + + // Produces a GPR sequence for each operand of the binary logic operation. + // For SETCC, it produces the respective comparison, for TRUNCATE it truncates + // the value in a GPR and for logic operations, it will recursively produce + // a GPR sequence for the operation. + auto getLogicOperand = [&] (SDValue Operand) -> SDValue { + unsigned OperandOpcode = Operand.getOpcode(); + if (OperandOpcode == ISD::SETCC) + return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig); + else if (OperandOpcode == ISD::TRUNCATE) { + SDValue InputOp = Operand.getOperand(0); + EVT InVT = InputOp.getValueType(); + return SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 : + PPC::RLDICL, dl, InVT, InputOp, + S->getI64Imm(0, dl), + S->getI64Imm(63, dl)), 0); + } else if (isLogicOp(OperandOpcode)) + return computeLogicOpInGPR(Operand); + return SDValue(); + }; + LHS = getLogicOperand(LogicOp.getOperand(0)); + RHS = getLogicOperand(LogicOp.getOperand(1)); + + // If a GPR sequence can't be produced for the LHS we can't proceed. + // Not producing a GPR sequence for the RHS is only a problem if this isn't + // a bitwise negation operation. + if (!LHS || (!RHS && !IsBitwiseNegation)) + return SDValue(); + + NumLogicOpsOnComparison++; + + // We will use the inputs as 64-bit values. 
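The TRUNCATE arm of getLogicOperand relies on RLDICL(x, 0, 63) being exactly x & 1. A quick standalone check of that reading of rldicl (rotate-left-doubleword-immediate-then-clear-left, written out locally):

#include <cassert>
#include <cstdint>

// rldicl x, SH, MB: rotate left by SH, keep bits MB..63 (PPC numbers
// bits from the MSB), i.e. AND with a mask of the low 64-MB bits.
static uint64_t rldicl(uint64_t X, unsigned SH, unsigned MB) {
  uint64_t Rot = SH ? (X << SH) | (X >> (64 - SH)) : X;
  uint64_t Mask = MB ? (~0ULL >> MB) : ~0ULL;
  return Rot & Mask;
}

int main() {
  for (uint64_t X : {0x0ULL, 0x1ULL, 0xfffffffffffffffeULL, 0x12345ULL})
    assert(rldicl(X, 0, 63) == (X & 1)); // the i1 truncation above
}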
+ if (LHS.getValueType() == MVT::i32) + LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext); + if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32) + RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext); + + unsigned NewOpc; + switch (LogicOp.getOpcode()) { + default: llvm_unreachable("Unknown logic operation."); + case ISD::AND: NewOpc = PPC::AND8; break; + case ISD::OR: NewOpc = PPC::OR8; break; + case ISD::XOR: NewOpc = PPC::XOR8; break; } -} -/// getCRIdxForSetCC - Return the index of the condition register field -/// associated with the SetCC condition, and whether or not the field is -/// treated as inverted. That is, lt = 0; ge = 0 inverted. -static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) { - Invert = false; - switch (CC) { - default: llvm_unreachable("Unknown condition!"); - case ISD::SETOLT: - case ISD::SETLT: return 0; // Bit #0 = SETOLT - case ISD::SETOGT: - case ISD::SETGT: return 1; // Bit #1 = SETOGT - case ISD::SETOEQ: - case ISD::SETEQ: return 2; // Bit #2 = SETOEQ - case ISD::SETUO: return 3; // Bit #3 = SETUO - case ISD::SETUGE: - case ISD::SETGE: Invert = true; return 0; // !Bit #0 = SETUGE - case ISD::SETULE: - case ISD::SETLE: Invert = true; return 1; // !Bit #1 = SETULE - case ISD::SETUNE: - case ISD::SETNE: Invert = true; return 2; // !Bit #2 = SETUNE - case ISD::SETO: Invert = true; return 3; // !Bit #3 = SETO - case ISD::SETUEQ: - case ISD::SETOGE: - case ISD::SETOLE: - case ISD::SETONE: - llvm_unreachable("Invalid branch code: should be expanded by legalize"); - // These are invalid for floating point. Assume integer. - case ISD::SETULT: return 0; - case ISD::SETUGT: return 1; + if (IsBitwiseNegation) { + RHS = S->getI64Imm(1, dl); + NewOpc = PPC::XORI8; } + + return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0); + } -// getVCmpInst: return the vector compare instruction for the specified -// vector type and condition code. Since this is for altivec specific code, -// only support the altivec types (v16i8, v8i16, v4i32, v2i64, and v4f32). -static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, - bool HasVSX, bool &Swap, bool &Negate) { - Swap = false; - Negate = false; +/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it. +/// Otherwise just reinterpret it as a 64-bit value. +/// Useful when emitting comparison code for 32-bit values without using +/// the compare instruction (which only considers the lower 32-bits). +SDValue IntegerCompareEliminator::signExtendInputIfNeeded(SDValue Input) { + assert(Input.getValueType() == MVT::i32 && + "Can only sign-extend 32-bit values here."); + unsigned Opc = Input.getOpcode(); - if (VecVT.isFloatingPoint()) { - /* Handle some cases by swapping input operands. */ - switch (CC) { - case ISD::SETLE: CC = ISD::SETGE; Swap = true; break; - case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; - case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break; - case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break; - case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; - case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break; - default: break; - } - /* Handle some cases by negating the result. */ - switch (CC) { - case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; - case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break; - case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break; - case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break; - default: break; - } - /* We have instructions implementing the remaining cases. 
*/ - switch (CC) { - case ISD::SETEQ: - case ISD::SETOEQ: - if (VecVT == MVT::v4f32) - return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP; - else if (VecVT == MVT::v2f64) - return PPC::XVCMPEQDP; - break; - case ISD::SETGT: - case ISD::SETOGT: - if (VecVT == MVT::v4f32) - return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP; - else if (VecVT == MVT::v2f64) - return PPC::XVCMPGTDP; - break; - case ISD::SETGE: - case ISD::SETOGE: - if (VecVT == MVT::v4f32) - return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP; - else if (VecVT == MVT::v2f64) - return PPC::XVCMPGEDP; - break; - default: - break; - } - llvm_unreachable("Invalid floating-point vector compare condition"); - } else { - /* Handle some cases by swapping input operands. */ - switch (CC) { - case ISD::SETGE: CC = ISD::SETLE; Swap = true; break; - case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; - case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; - case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break; - default: break; - } - /* Handle some cases by negating the result. */ - switch (CC) { - case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; - case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break; - case ISD::SETLE: CC = ISD::SETGT; Negate = true; break; - case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break; - default: break; - } - /* We have instructions implementing the remaining cases. */ - switch (CC) { - case ISD::SETEQ: - case ISD::SETUEQ: - if (VecVT == MVT::v16i8) - return PPC::VCMPEQUB; - else if (VecVT == MVT::v8i16) - return PPC::VCMPEQUH; - else if (VecVT == MVT::v4i32) - return PPC::VCMPEQUW; - else if (VecVT == MVT::v2i64) - return PPC::VCMPEQUD; - break; - case ISD::SETGT: - if (VecVT == MVT::v16i8) - return PPC::VCMPGTSB; - else if (VecVT == MVT::v8i16) - return PPC::VCMPGTSH; - else if (VecVT == MVT::v4i32) - return PPC::VCMPGTSW; - else if (VecVT == MVT::v2i64) - return PPC::VCMPGTSD; - break; - case ISD::SETUGT: - if (VecVT == MVT::v16i8) - return PPC::VCMPGTUB; - else if (VecVT == MVT::v8i16) - return PPC::VCMPGTUH; - else if (VecVT == MVT::v4i32) - return PPC::VCMPGTUW; - else if (VecVT == MVT::v2i64) - return PPC::VCMPGTUD; - break; - default: - break; - } - llvm_unreachable("Invalid integer vector compare condition"); - } + // The value was sign extended and then truncated to 32-bits. No need to + // sign extend it again. + if (Opc == ISD::TRUNCATE && + (Input.getOperand(0).getOpcode() == ISD::AssertSext || + Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND)) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + LoadSDNode *InputLoad = dyn_cast(Input); + // The input is a sign-extending load. All ppc sign-extending loads + // sign-extend to the full 64-bits. + if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + ConstantSDNode *InputConst = dyn_cast(Input); + // We don't sign-extend constants. + if (InputConst) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + SDLoc dl(Input); + SignExtensionsAdded++; + return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32_64, dl, + MVT::i64, Input), 0); } -bool PPCDAGToDAGISel::trySETCC(SDNode *N) { - SDLoc dl(N); - unsigned Imm; - ISD::CondCode CC = cast(N->getOperand(2))->get(); - EVT PtrVT = - CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); - bool isPPC64 = (PtrVT == MVT::i64); +/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it. +/// Otherwise just reinterpret it as a 64-bit value. 
+/// Useful when emitting comparison code for 32-bit values without using +/// the compare instruction (which only considers the lower 32-bits). +SDValue IntegerCompareEliminator::zeroExtendInputIfNeeded(SDValue Input) { + assert(Input.getValueType() == MVT::i32 && + "Can only zero-extend 32-bit values here."); + unsigned Opc = Input.getOpcode(); - if (!PPCSubTarget->useCRBits() && - isInt32Immediate(N->getOperand(1), Imm)) { - // We can codegen setcc op, imm very efficiently compared to a brcond. - // Check for those cases here. - // setcc op, 0 - if (Imm == 0) { - SDValue Op = N->getOperand(0); - switch (CC) { - default: break; - case ISD::SETEQ: { - Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0); - SDValue Ops[] = { Op, getI32Imm(27, dl), getI32Imm(5, dl), - getI32Imm(31, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; - } - case ISD::SETNE: { - if (isPPC64) break; - SDValue AD = - SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, - Op, getI32Imm(~0U, dl)), 0); - CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, AD.getValue(1)); - return true; - } - case ISD::SETLT: { - SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), - getI32Imm(31, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; - } - case ISD::SETGT: { - SDValue T = - SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0); - T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0); - SDValue Ops[] = { T, getI32Imm(1, dl), getI32Imm(31, dl), - getI32Imm(31, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; - } - } - } else if (Imm == ~0U) { // setcc op, -1 - SDValue Op = N->getOperand(0); - switch (CC) { - default: break; - case ISD::SETEQ: - if (isPPC64) break; - Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, - Op, getI32Imm(1, dl)), 0); - CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, - SDValue(CurDAG->getMachineNode(PPC::LI, dl, - MVT::i32, - getI32Imm(0, dl)), - 0), Op.getValue(1)); - return true; - case ISD::SETNE: { - if (isPPC64) break; - Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0); - SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, - Op, getI32Imm(~0U, dl)); - CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), Op, - SDValue(AD, 1)); - return true; - } - case ISD::SETLT: { - SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op, - getI32Imm(1, dl)), 0); - SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD, - Op), 0); - SDValue Ops[] = { AN, getI32Imm(1, dl), getI32Imm(31, dl), - getI32Imm(31, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; - } - case ISD::SETGT: { - SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), - getI32Imm(31, dl) }; - Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); - CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, getI32Imm(1, dl)); - return true; - } - } - } + // The only condition under which we can omit the actual extend instruction: + // - The value is a positive constant + // - The value comes from a load that isn't a sign-extending load + // An ISD::TRUNCATE needs to be zero-extended unless it is fed by a zext. 
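Condensed, the decision this function makes looks like the following sketch (the Producer enum is illustrative; the real code inspects SDNode opcodes): only producers that already guarantee a clean upper word avoid the explicit RLDICL_32_64.

#include <cassert>

// Illustrative condensation; names here are local to the sketch.
enum class Producer { TruncOfZext, NonNegConstant, NonSextLoad, Other };

static bool needsExplicitZext(Producer P) {
  switch (P) {
  case Producer::TruncOfZext:    // trunc fed by zext/AssertZext
  case Producer::NonNegConstant: // constant with the sign bit clear
  case Producer::NonSextLoad:    // any load other than a SEXTLOAD
    return false;                // reinterpret in place (INSERT_SUBREG)
  case Producer::Other:
    return true;                 // pay for one rldicl ..., 0, 32
  }
  return true;
}

int main() {
  assert(!needsExplicitZext(Producer::NonSextLoad));
  assert(needsExplicitZext(Producer::Other));
}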
+ bool IsTruncateOfZExt = Opc == ISD::TRUNCATE && + (Input.getOperand(0).getOpcode() == ISD::AssertZext || + Input.getOperand(0).getOpcode() == ISD::ZERO_EXTEND); + if (IsTruncateOfZExt) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input); + if (InputConst && InputConst->getSExtValue() >= 0) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input); + // The input is a load that doesn't sign-extend (it will be zero-extended). + if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + // None of the above, need to zero-extend. + SDLoc dl(Input); + ZeroExtensionsAdded++; + return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32_64, dl, MVT::i64, Input, + S->getI64Imm(0, dl), + S->getI64Imm(32, dl)), 0); +} + +// Handle a 32-bit value in a 64-bit register and vice-versa. These are of +// course not actual zero/sign extensions that will generate machine code, +// they're just a way to reinterpret a 32 bit value in a register as a +// 64 bit value and vice-versa. +SDValue IntegerCompareEliminator::addExtOrTrunc(SDValue NatWidthRes, + ExtOrTruncConversion Conv) { + SDLoc dl(NatWidthRes); + + // For reinterpreting 32-bit values as 64 bit values, we generate + // INSERT_SUBREG IMPLICIT_DEF:i64, <input>, TargetConstant:i32<1> + if (Conv == ExtOrTruncConversion::Ext) { + SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0); + SDValue SubRegIdx = + CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); + return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64, + ImDef, NatWidthRes, SubRegIdx), 0); } - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); + assert(Conv == ExtOrTruncConversion::Trunc && + "Unknown conversion between 32 and 64 bit values."); + // For reinterpreting 64-bit values as 32-bit values, we just need to + // EXTRACT_SUBREG (i.e. extract the low word). + SDValue SubRegIdx = + CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); + return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32, + NatWidthRes, SubRegIdx), 0); +} - // Altivec Vector compare instructions do not set any CR register by default and - // vector compare operations return the same type as the operands. - if (LHS.getValueType().isVector()) { - if (PPCSubTarget->hasQPX()) - return false; +// Produce a GPR sequence for compound comparisons (<=, >=) against zero. +// Handle both zero-extensions and sign-extensions. +SDValue +IntegerCompareEliminator::getCompoundZeroComparisonInGPR(SDValue LHS, SDLoc dl, + ZeroCompare CmpTy) { + EVT InVT = LHS.getValueType(); + bool Is32Bit = InVT == MVT::i32; + SDValue ToExtend; + + // Produce the value that needs to be either zero or sign extended. + switch (CmpTy) { + case ZeroCompare::GEZExt: + case ZeroCompare::GESExt: + ToExtend = SDValue(CurDAG->getMachineNode(Is32Bit ? PPC::NOR : PPC::NOR8, + dl, InVT, LHS, LHS), 0); + break; + case ZeroCompare::LEZExt: + case ZeroCompare::LESExt: { + if (Is32Bit) { + // Upper 32 bits cannot be undefined for this sequence.
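The zero-compare sequences reduce to sign-bit arithmetic, which can be verified standalone (assuming arithmetic right shift for signed types, as on the target): zext(a >= 0) is the inverted sign bit, sext(a >= 0) smears it, and zext(a <= 0) flips the sign bit of the 64-bit negation, which is why the 32-bit LE case sign-extends its input first.

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t a : {0, 1, -1, 42, INT32_MIN, INT32_MAX}) {
    uint32_t NotA = ~(uint32_t)a;                       // nor %a, %a
    assert((NotA >> 31) == (a >= 0 ? 1u : 0u));         // GEZExt
    assert(((int32_t)NotA >> 31) == (a >= 0 ? -1 : 0)); // GESExt
    // LE: sign-extend first so the upper word is defined, then negate
    // and flip the sign bit of the negation.
    int64_t A64 = a;                                    // EXTSW_32_64
    uint64_t SignOfNeg = (uint64_t)(-A64) >> 63;
    assert((SignOfNeg ^ 1) == (a <= 0 ? 1u : 0u));      // LEZExt
  }
}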
+ LHS = signExtendInputIfNeeded(LHS); + SDValue Neg = + SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, LHS), 0); + ToExtend = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + Neg, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + } else { + SDValue Addi = + SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, LHS, + S->getI64Imm(~0ULL, dl)), 0); + ToExtend = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, + Addi, LHS), 0); + } + break; + } + } - EVT VecVT = LHS.getValueType(); - bool Swap, Negate; - unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC, - PPCSubTarget->hasVSX(), Swap, Negate); - if (Swap) - std::swap(LHS, RHS); + // For 64-bit sequences, the extensions are the same for the GE/LE cases. + if (!Is32Bit && + (CmpTy == ZeroCompare::GEZExt || CmpTy == ZeroCompare::LEZExt)) + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + ToExtend, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + if (!Is32Bit && + (CmpTy == ZeroCompare::GESExt || CmpTy == ZeroCompare::LESExt)) + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, ToExtend, + S->getI64Imm(63, dl)), 0); + + assert(Is32Bit && "Should have handled the 32-bit sequences above."); + // For 32-bit sequences, the extensions differ between GE/LE cases. + switch (CmpTy) { + case ZeroCompare::GEZExt: { + SDValue ShiftOps[] = { ToExtend, S->getI32Imm(1, dl), S->getI32Imm(31, dl), + S->getI32Imm(31, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, + ShiftOps), 0); + } + case ZeroCompare::GESExt: + return SDValue(CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, ToExtend, + S->getI32Imm(31, dl)), 0); + case ZeroCompare::LEZExt: + return SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, ToExtend, + S->getI32Imm(1, dl)), 0); + case ZeroCompare::LESExt: + return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, ToExtend, + S->getI32Imm(-1, dl)), 0); + } - EVT ResVT = VecVT.changeVectorElementTypeToInteger(); - if (Negate) { - SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0); - CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR, - ResVT, VCmp, VCmp); - return true; + // The above case covers all the enumerators so it can't have a default clause + // to avoid compiler warnings. + llvm_unreachable("Unknown zero-comparison type."); +} + +/// Produces a zero-extended result of comparing two 32-bit values according to +/// the passed condition code. +SDValue +IntegerCompareEliminator::get32BitZExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + if (CmpInGPR == ICGPR_I64 || CmpInGPR == ICGPR_SextI64 || + CmpInGPR == ICGPR_ZextI64 || CmpInGPR == ICGPR_Sext) + return SDValue(); + bool IsRHSZero = RHSValue == 0; + bool IsRHSOne = RHSValue == 1; + bool IsRHSNegOne = RHSValue == -1LL; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5) + // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5) + SDValue Xor = IsRHSZero ? 
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); + SDValue ShiftOps[] = { Clz, S->getI32Imm(27, dl), S->getI32Imm(5, dl), + S->getI32Imm(31, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, + ShiftOps), 0); + } + case ISD::SETNE: { + // (zext (setcc %a, %b, setne)) -> (xor (lshr (cntlzw (xor %a, %b)), 5), 1) + // (zext (setcc %a, 0, setne)) -> (xor (lshr (cntlzw %a), 5), 1) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); + SDValue ShiftOps[] = { Clz, S->getI32Imm(27, dl), S->getI32Imm(5, dl), + S->getI32Imm(31, dl) }; + SDValue Shift = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0); + return SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift, + S->getI32Imm(1, dl)), 0); + } + case ISD::SETGE: { + // (zext (setcc %a, %b, setge)) -> (xor (lshr (sub %a, %b), 63), 1) + // (zext (setcc %a, 0, setge)) -> (lshr (~ %a), 31) + if(IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt); + + // Not a special case (i.e. RHS == 0). Handle (%a >= %b) as (%b <= %a) + // by swapping inputs and falling through. + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + LLVM_FALLTHROUGH; + } + case ISD::SETLE: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // (zext (setcc %a, %b, setle)) -> (xor (lshr (sub %b, %a), 63), 1) + // (zext (setcc %a, 0, setle)) -> (xor (lshr (- %a), 63), 1) + if(IsRHSZero) { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt); } - CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS); - return true; + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue Sub = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, LHS, RHS), 0); + SDValue Shift = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Sub, + S->getI64Imm(1, dl), S->getI64Imm(63, dl)), + 0); + return + SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, + MVT::i64, Shift, S->getI32Imm(1, dl)), 0); } + case ISD::SETGT: { + // (zext (setcc %a, %b, setgt)) -> (lshr (sub %b, %a), 63) + // (zext (setcc %a, -1, setgt)) -> (lshr (~ %a), 31) + // (zext (setcc %a, 0, setgt)) -> (lshr (- %a), 63) + // Handle SETLT -1 (which is equivalent to SETGE 0). + if (IsRHSNegOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt); + + if (IsRHSZero) { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue Neg = + SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + Neg, S->getI32Imm(1, dl), S->getI32Imm(63, dl)), 0); + } + // Not a special case (i.e. RHS == 0 or RHS == -1). Handle (%a > %b) as + // (%b < %a) by swapping inputs and falling through. 
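The SETEQ/SETNE sequences above lean on a classic count-leading-zeros trick: for 32-bit x, cntlzw(x) is 32 exactly when x == 0, and 32 is the only possible result with bit 5 set. Standalone check (cntlzw written out locally):

#include <cassert>
#include <cstdint>

static unsigned cntlzw(uint32_t X) { // matches the PPC instruction
  unsigned N = 0;
  while (N < 32 && !(X & (0x80000000u >> N)))
    ++N;
  return N; // 32 iff X == 0
}

int main() {
  uint32_t Samples[] = {0u, 1u, 0x8000u, 0xdeadbeefu, 0xffffffffu};
  for (uint32_t a : Samples)
    for (uint32_t b : Samples)
      // rlwinm Clz, 27, 5, 31 acts as a right shift by 5 here.
      assert(((cntlzw(a ^ b) >> 5) & 1u) == (a == b ? 1u : 0u));
}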
+ std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1; + LLVM_FALLTHROUGH; + } + case ISD::SETLT: { + // (zext (setcc %a, %b, setlt)) -> (lshr (sub %a, %b), 63) + // (zext (setcc %a, 1, setlt)) -> (xor (lshr (- %a), 63), 1) + // (zext (setcc %a, 0, setlt)) -> (lshr %a, 31) + // Handle SETLT 1 (which is equivalent to SETLE 0). + if (IsRHSOne) { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt); + } - if (PPCSubTarget->useCRBits()) - return false; - - bool Inv; - unsigned Idx = getCRIdxForSetCC(CC, Inv); - SDValue CCReg = SelectCC(LHS, RHS, CC, dl); - SDValue IntCR; - - // Force the ccreg into CR7. - SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32); + if (IsRHSZero) { + SDValue ShiftOps[] = { LHS, S->getI32Imm(1, dl), S->getI32Imm(31, dl), + S->getI32Imm(31, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, + ShiftOps), 0); + } - SDValue InFlag(nullptr, 0); // Null incoming flag value. - CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, - InFlag).getValue(1); + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue SUBFNode = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + SUBFNode, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + } + case ISD::SETUGE: + // (zext (setcc %a, %b, setuge)) -> (xor (lshr (sub %b, %a), 63), 1) + // (zext (setcc %a, %b, setule)) -> (xor (lshr (sub %a, %b), 63), 1) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULE: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = zeroExtendInputIfNeeded(LHS); + RHS = zeroExtendInputIfNeeded(RHS); + SDValue Subtract = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, LHS, RHS), 0); + SDValue SrdiNode = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + Subtract, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, SrdiNode, + S->getI32Imm(1, dl)), 0); + } + case ISD::SETUGT: + // (zext (setcc %a, %b, setugt)) -> (lshr (sub %b, %a), 63) + // (zext (setcc %a, %b, setult)) -> (lshr (sub %a, %b), 63) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULT: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = zeroExtendInputIfNeeded(LHS); + RHS = zeroExtendInputIfNeeded(RHS); + SDValue Subtract = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + Subtract, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + } + } +} - IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg, - CCReg), 0); +/// Produces a sign-extended result of comparing two 32-bit values according to +/// the passed condition code. 
+SDValue +IntegerCompareEliminator::get32BitSExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + if (CmpInGPR == ICGPR_I64 || CmpInGPR == ICGPR_SextI64 || + CmpInGPR == ICGPR_ZextI64 || CmpInGPR == ICGPR_Zext) + return SDValue(); + bool IsRHSZero = RHSValue == 0; + bool IsRHSOne = RHSValue == 1; + bool IsRHSNegOne = RHSValue == -1LL; - SDValue Ops[] = { IntCR, getI32Imm((32 - (3 - Idx)) & 31, dl), - getI32Imm(31, dl), getI32Imm(31, dl) }; - if (!Inv) { - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (sext (setcc %a, %b, seteq)) -> + // (ashr (shl (ctlz (xor %a, %b)), 58), 63) + // (sext (setcc %a, 0, seteq)) -> + // (ashr (shl (ctlz %a), 58), 63) + SDValue CountInput = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Cntlzw = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0); + SDValue SHLOps[] = { Cntlzw, S->getI32Imm(27, dl), + S->getI32Imm(5, dl), S->getI32Imm(31, dl) }; + SDValue Slwi = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, SHLOps), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Slwi), 0); + } + case ISD::SETNE: { + // Bitwise xor the operands, count leading zeros, shift right by 5 bits and + // flip the bit, finally take 2's complement. + // (sext (setcc %a, %b, setne)) -> + // (neg (xor (lshr (ctlz (xor %a, %b)), 5), 1)) + // Same as above, but the first xor is not needed. + // (sext (setcc %a, 0, setne)) -> + // (neg (xor (lshr (ctlz %a), 5), 1)) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); + SDValue ShiftOps[] = + { Clz, S->getI32Imm(27, dl), S->getI32Imm(5, dl), S->getI32Imm(31, dl) }; + SDValue Shift = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0); + SDValue Xori = + SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift, + S->getI32Imm(1, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Xori), 0); + } + case ISD::SETGE: { + // (sext (setcc %a, %b, setge)) -> (add (lshr (sub %a, %b), 63), -1) + // (sext (setcc %a, 0, setge)) -> (ashr (~ %a), 31) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt); + + // Not a special case (i.e. RHS == 0). Handle (%a >= %b) as (%b <= %a) + // by swapping inputs and falling through. + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + LLVM_FALLTHROUGH; + } + case ISD::SETLE: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // (sext (setcc %a, %b, setge)) -> (add (lshr (sub %b, %a), 63), -1) + // (sext (setcc %a, 0, setle)) -> (add (lshr (- %a), 63), -1) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt); + + // The upper 32-bits of the register can't be undefined for this sequence. 
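The sign-extended SETEQ/SETNE forms just negate the zero-extended bit, since -x maps 0/1 to the 0/-1 mask. Spot check (cntlzw again written out locally):

#include <cassert>
#include <cstdint>

static unsigned cntlzw(uint32_t X) {
  unsigned N = 0;
  while (N < 32 && !(X & (0x80000000u >> N)))
    ++N;
  return N;
}

int main() {
  uint32_t Samples[] = {0u, 7u, 0xdeadbeefu};
  for (uint32_t a : Samples)
    for (uint32_t b : Samples) {
      int32_t ZextEq = (int32_t)((cntlzw(a ^ b) >> 5) & 1u);
      assert(-ZextEq == (a == b ? -1 : 0));       // sext(seteq): neg
      assert(-(ZextEq ^ 1) == (a != b ? -1 : 0)); // sext(setne): xori, neg
    }
}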
+ LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue SUBFNode = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, MVT::Glue, + LHS, RHS), 0); + SDValue Srdi = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + SUBFNode, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, Srdi, + S->getI32Imm(-1, dl)), 0); + } + case ISD::SETGT: { + // (sext (setcc %a, %b, setgt)) -> (ashr (sub %b, %a), 63) + // (sext (setcc %a, -1, setgt)) -> (ashr (~ %a), 31) + // (sext (setcc %a, 0, setgt)) -> (ashr (- %a), 63) + if (IsRHSNegOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt); + if (IsRHSZero) { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue Neg = + SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, Neg, + S->getI64Imm(63, dl)), 0); + } + // Not a special case (i.e. RHS == 0 or RHS == -1). Handle (%a > %b) as + // (%b < %a) by swapping inputs and falling through. + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1; + LLVM_FALLTHROUGH; } + case ISD::SETLT: { + // (sext (setcc %a, %b, setgt)) -> (ashr (sub %a, %b), 63) + // (sext (setcc %a, 1, setgt)) -> (add (lshr (- %a), 63), -1) + // (sext (setcc %a, 0, setgt)) -> (ashr %a, 31) + if (IsRHSOne) { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt); + } + if (IsRHSZero) + return SDValue(CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, LHS, + S->getI32Imm(31, dl)), 0); - // Get the specified bit. - SDValue Tmp = - SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); - CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl)); - return true; + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue SUBFNode = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, + SUBFNode, S->getI64Imm(63, dl)), 0); + } + case ISD::SETUGE: + // (sext (setcc %a, %b, setuge)) -> (add (lshr (sub %a, %b), 63), -1) + // (sext (setcc %a, %b, setule)) -> (add (lshr (sub %b, %a), 63), -1) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULE: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. 
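The unsigned 32-bit orderings piggyback on 64-bit subtraction: once both inputs are zero-extended, a - b cannot wrap, so bit 63 of the difference is precisely a <u b. Standalone check:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Samples[] = {0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu};
  for (uint32_t a : Samples)
    for (uint32_t b : Samples) {
      // Both sides zero-extended: the difference fits 64 bits exactly.
      int64_t Diff = (int64_t)(uint64_t)a - (int64_t)(uint64_t)b;
      assert(((uint64_t)Diff >> 63) == (a < b ? 1u : 0u)); // zext(setult)
      assert((Diff >> 63) == (a < b ? -1 : 0));            // sext(setult)
    }
}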
+ LHS = zeroExtendInputIfNeeded(LHS); + RHS = zeroExtendInputIfNeeded(RHS); + SDValue Subtract = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, LHS, RHS), 0); + SDValue Shift = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Subtract, + S->getI32Imm(1, dl), S->getI32Imm(63,dl)), + 0); + return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, Shift, + S->getI32Imm(-1, dl)), 0); + } + case ISD::SETUGT: + // (sext (setcc %a, %b, setugt)) -> (ashr (sub %b, %a), 63) + // (sext (setcc %a, %b, setugt)) -> (ashr (sub %a, %b), 63) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULT: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = zeroExtendInputIfNeeded(LHS); + RHS = zeroExtendInputIfNeeded(RHS); + SDValue Subtract = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, + Subtract, S->getI64Imm(63, dl)), 0); + } + } } -// Is this opcode a bitwise logical operation? -static bool isLogicOp(unsigned Opc) { - return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR; +/// Produces a zero-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue +IntegerCompareEliminator::get64BitZExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + if (CmpInGPR == ICGPR_I32 || CmpInGPR == ICGPR_SextI32 || + CmpInGPR == ICGPR_ZextI32 || CmpInGPR == ICGPR_Sext) + return SDValue(); + bool IsRHSZero = RHSValue == 0; + bool IsRHSOne = RHSValue == 1; + bool IsRHSNegOne = RHSValue == -1LL; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6) + // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz, + S->getI64Imm(58, dl), + S->getI64Imm(63, dl)), 0); + } + case ISD::SETNE: { + // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1) + // (zext (setcc %a, %b, setne)) -> (sube addc.reg, addc.reg, addc.CA) + // {addcz.reg, addcz.CA} = (addcarry %a, -1) + // (zext (setcc %a, 0, setne)) -> (sube addcz.reg, addcz.reg, addcz.CA) + SDValue Xor = IsRHSZero ? 
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue AC = + SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue, + Xor, S->getI32Imm(~0U, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, AC, + Xor, AC.getValue(1)), 0); + } + case ISD::SETGE: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (zext (setcc %a, %b, setge)) -> + // (adde (lshr %b, 63), (ashr %a, 63), subc.CA) + // (zext (setcc %a, 0, setge)) -> (lshr (~ %a), 63) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt); + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + LLVM_FALLTHROUGH; + } + case ISD::SETLE: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (zext (setcc %a, %b, setge)) -> + // (adde (lshr %a, 63), (ashr %b, 63), subc.CA) + // (zext (setcc %a, 0, setge)) -> (lshr (or %a, (add %a, -1)), 63) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt); + SDValue ShiftL = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, LHS, + S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + SDValue ShiftR = + SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, RHS, + S->getI64Imm(63, dl)), 0); + SDValue SubtractCarry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + LHS, RHS), 1); + return SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, MVT::Glue, + ShiftR, ShiftL, SubtractCarry), 0); + } + case ISD::SETGT: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (zext (setcc %a, %b, setgt)) -> + // (xor (adde (lshr %a, 63), (ashr %b, 63), subc.CA), 1) + // (zext (setcc %a, 0, setgt)) -> (lshr (nor (add %a, -1), %a), 63) + if (IsRHSNegOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt); + if (IsRHSZero) { + SDValue Addi = + SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, LHS, + S->getI64Imm(~0ULL, dl)), 0); + SDValue Nor = + SDValue(CurDAG->getMachineNode(PPC::NOR8, dl, MVT::i64, Addi, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Nor, + S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + } + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1; + LLVM_FALLTHROUGH; + } + case ISD::SETLT: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (zext (setcc %a, %b, setlt)) -> + // (xor (adde (lshr %b, 63), (ashr %a, 63), subc.CA), 1) + // (zext (setcc %a, 0, setlt)) -> (lshr %a, 63) + if (IsRHSOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt); + if (IsRHSZero) + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, LHS, + S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + SDValue SRADINode = + SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, + LHS, S->getI64Imm(63, dl)), 0); + SDValue SRDINode = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + RHS, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + SDValue SUBFC8Carry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + RHS, LHS), 1); + SDValue ADDE8Node = + SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, MVT::Glue, + SRDINode, SRADINode, SUBFC8Carry), 0); + return SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, + ADDE8Node, S->getI64Imm(1, dl)), 0); + } + case ISD::SETUGE: + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (zext (setcc %a, %b, setuge)) -> 
(add (sube %b, %b, subc.CA), 1) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULE: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (zext (setcc %a, %b, setule)) -> (add (sube %a, %a, subc.CA), 1) + SDValue SUBFC8Carry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + LHS, RHS), 1); + SDValue SUBFE8Node = + SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, MVT::Glue, + LHS, LHS, SUBFC8Carry), 0); + return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, + SUBFE8Node, S->getI64Imm(1, dl)), 0); + } + case ISD::SETUGT: + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (zext (setcc %a, %b, setugt)) -> -(sube %b, %b, subc.CA) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULT: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (zext (setcc %a, %b, setult)) -> -(sube %a, %a, subc.CA) + SDValue SubtractCarry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + RHS, LHS), 1); + SDValue ExtSub = + SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, + LHS, LHS, SubtractCarry), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, + ExtSub), 0); + } + } } -/// If this node is a sign/zero extension of an integer comparison, -/// it can usually be computed in GPR's rather than using comparison -/// instructions and ISEL. We only do this on 64-bit targets for now -/// as the code is specialized for 64-bit (it uses 64-bit instructions -/// and assumes 64-bit registers). -bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) { - if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) - return false; - assert((N->getOpcode() == ISD::ZERO_EXTEND || - N->getOpcode() == ISD::SIGN_EXTEND) && - "Expecting a zero/sign extend node!"); - - SDValue WideRes; - // If we are zero-extending the result of a logical operation on i1 - // values, we can keep the values in GPRs. - if (isLogicOp(N->getOperand(0).getOpcode()) && - N->getOperand(0).getValueType() == MVT::i1 && - N->getOpcode() == ISD::ZERO_EXTEND) - WideRes = computeLogicOpInGPR(N->getOperand(0)); - else if (N->getOperand(0).getOpcode() != ISD::SETCC) - return false; - else - WideRes = - getSETCCInGPR(N->getOperand(0), - N->getOpcode() == ISD::SIGN_EXTEND ? - SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); - - if (!WideRes) - return false; - - SDLoc dl(N); - bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32; - bool Output32Bit = N->getValueType(0) == MVT::i32; - - NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0; - NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1; +/// Produces a sign-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue +IntegerCompareEliminator::get64BitSExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + if (CmpInGPR == ICGPR_I32 || CmpInGPR == ICGPR_SextI32 || + CmpInGPR == ICGPR_ZextI32 || CmpInGPR == ICGPR_Zext) + return SDValue(); + bool IsRHSZero = RHSValue == 0; + bool IsRHSOne = RHSValue == 1; + bool IsRHSNegOne = RHSValue == -1LL; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1) + // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA) + // {addcz.reg, addcz.CA} = (addcarry %a, -1) + // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA) + SDValue AddInput = IsRHSZero ? 
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Addic = + SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue, + AddInput, S->getI32Imm(~0U, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic, + Addic, Addic.getValue(1)), 0); + } + case ISD::SETNE: { + // {subfc.reg, subfc.CA} = (subcarry 0, (xor %a, %b)) + // (sext (setcc %a, %b, setne)) -> (sube subfc.reg, subfc.reg, subfc.CA) + // {subfcz.reg, subfcz.CA} = (subcarry 0, %a) + // (sext (setcc %a, 0, setne)) -> (sube subfcz.reg, subfcz.reg, subfcz.CA) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue SC = + SDValue(CurDAG->getMachineNode(PPC::SUBFIC8, dl, MVT::i64, MVT::Glue, + Xor, S->getI32Imm(0, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, SC, + SC, SC.getValue(1)), 0); + } + case ISD::SETGE: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (sext (setcc %a, %b, setge)) -> + // (- (adde (lshr %b, 63), (ashr %a, 63), subc.CA)) + // (sext (setcc %a, 0, setge)) -> (~ (ashr %a, 63)) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt); + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + LLVM_FALLTHROUGH; + } + case ISD::SETLE: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (sext (setcc %a, %b, setle)) -> + // (- (adde (lshr %a, 63), (ashr %b, 63), subc.CA)) + // (sext (setcc %a, 0, setle)) -> (ashr (or %a, (add %a, -1)), 63) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt); + SDValue ShiftR = + SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, RHS, + S->getI64Imm(63, dl)), 0); + SDValue ShiftL = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, LHS, + S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + SDValue SubtractCarry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + LHS, RHS), 1); + SDValue Adde = + SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, MVT::Glue, + ShiftR, ShiftL, SubtractCarry), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, Adde), 0); + } + case ISD::SETGT: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (sext (setcc %a, %b, setgt)) -> + // -(xor (adde (lshr %a, 63), (ashr %b, 63), subc.CA), 1) + // (sext (setcc %a, 0, setgt)) -> (ashr (nor (add %a, -1), %a), 63) + if (IsRHSNegOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt); + if (IsRHSZero) { + SDValue Add = + SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, LHS, + S->getI64Imm(-1, dl)), 0); + SDValue Nor = + SDValue(CurDAG->getMachineNode(PPC::NOR8, dl, MVT::i64, Add, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, Nor, + S->getI64Imm(63, dl)), 0); + } + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1; + LLVM_FALLTHROUGH; + } + case ISD::SETLT: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (sext (setcc %a, %b, setlt)) -> + // -(xor (adde (lshr %b, 63), (ashr %a, 63), subc.CA), 1) + // (sext (setcc %a, 0, setlt)) -> (ashr %a, 63) + if (IsRHSOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt); + if (IsRHSZero) { + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, LHS, + S->getI64Imm(63, dl)), 0); + } + SDValue SRADINode = + 
SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, + LHS, S->getI64Imm(63, dl)), 0); + SDValue SRDINode = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + RHS, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + SDValue SUBFC8Carry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + RHS, LHS), 1); + SDValue ADDE8Node = + SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, + SRDINode, SRADINode, SUBFC8Carry), 0); + SDValue XORI8Node = + SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, + ADDE8Node, S->getI64Imm(1, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, + XORI8Node), 0); + } + case ISD::SETUGE: + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (sext (setcc %a, %b, setuge)) -> ~(sube %b, %b, subc.CA) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULE: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (sext (setcc %a, %b, setule)) -> ~(sube %a, %a, subc.CA) + SDValue SubtractCarry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + LHS, RHS), 1); + SDValue ExtSub = + SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, MVT::Glue, LHS, + LHS, SubtractCarry), 0); + return SDValue(CurDAG->getMachineNode(PPC::NOR8, dl, MVT::i64, + ExtSub, ExtSub), 0); + } + case ISD::SETUGT: + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (sext (setcc %a, %b, setugt)) -> (sube %b, %b, subc.CA) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULT: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (sext (setcc %a, %b, setult)) -> (sube %a, %a, subc.CA) + SDValue SubCarry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + RHS, LHS), 1); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, + LHS, LHS, SubCarry), 0); + } + } +} - SDValue ConvOp = WideRes; - if (Inputs32Bit != Output32Bit) - ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext : - ExtOrTruncConversion::Trunc); - ReplaceNode(N, ConvOp.getNode()); +/// Do all uses of this SDValue need the result in a GPR? +/// This is meant to be used on values that have type i1 since +/// it is somewhat meaningless to ask if values of other types +/// should be kept in GPR's. +static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) { + assert(Compare.getOpcode() == ISD::SETCC && + "An ISD::SETCC node required here."); + // For values that have a single use, the caller should obviously already have + // checked if that use is an extending use. We check the other uses here. + if (Compare.hasOneUse()) + return true; + // We want the value in a GPR if it is being extended, used for a select, or + // used in logical operations. + for (auto CompareUse : Compare.getNode()->uses()) + if (CompareUse->getOpcode() != ISD::SIGN_EXTEND && + CompareUse->getOpcode() != ISD::ZERO_EXTEND && + CompareUse->getOpcode() != ISD::SELECT && + !isLogicOp(CompareUse->getOpcode())) { + OmittedForNonExtendUses++; + return false; + } return true; } -// Lower a logical operation on i1 values into a GPR sequence if possible. -// The result can be kept in a GPR if requested. -// Three types of inputs can be handled: -// - SETCC -// - TRUNCATE -// - Logical operation (AND/OR/XOR) -// There is also a special case that is handled (namely a complement operation -// achieved with xor %a, -1). 
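A note on the carry tricks above: subfe RT,RA,RB computes ~RA + RB + CA, so with RA == RB it collapses to -1 + CA, and each sign-extended compare reduces to materializing the right carry bit. The identities can be sanity-checked with a small standalone C++20 program; the value table and helper-free formulation below are illustrative assumptions, not taken from the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t vals[] = {-5, -1, 0, 1, 3, INT64_MIN, INT64_MAX};
    for (int64_t a : vals)
      for (int64_t b : vals) {
        uint64_t x = (uint64_t)a ^ (uint64_t)b;
        // addic x, -1 sets CA iff x + 0xFF..F carries, i.e. iff x != 0;
        // subfe(r, r, CA) == -1 + CA then models (sext (seteq %a, %b)).
        assert((-1 + (int64_t)(x != 0)) == (a == b ? -1 : 0));
        // subfic x, 0 computes 0 - x; CA (no borrow) holds iff x == 0,
        // which models (sext (setne %a, %b)).
        assert((-1 + (int64_t)(x == 0)) == (a != b ? -1 : 0));
        // subfc computing a - b sets CA iff a >= b unsigned, which
        // models (sext (setult %a, %b)).
        assert((-1 + (int64_t)((uint64_t)a >= (uint64_t)b)) ==
               ((uint64_t)a < (uint64_t)b ? -1 : 0));
      }
  }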
-SDValue PPCDAGToDAGISel::computeLogicOpInGPR(SDValue LogicOp) { - assert(isLogicOp(LogicOp.getOpcode()) && - "Can only handle logic operations here."); - assert(LogicOp.getValueType() == MVT::i1 && - "Can only handle logic operations on i1 values here."); - SDLoc dl(LogicOp); - SDValue LHS, RHS; - - // Special case: xor %a, -1 - bool IsBitwiseNegation = isBitwiseNot(LogicOp); +/// Returns an equivalent of a SETCC node but with the result the same width as +/// the inputs. This can also be used for SELECT_CC if either the true or false +/// value is a power of two while the other is zero. +SDValue IntegerCompareEliminator::getSETCCInGPR(SDValue Compare, + SetccInGPROpts ConvOpts) { + assert((Compare.getOpcode() == ISD::SETCC || + Compare.getOpcode() == ISD::SELECT_CC) && + "An ISD::SETCC node required here."); - // Produces a GPR sequence for each operand of the binary logic operation. - // For SETCC, it produces the respective comparison, for TRUNCATE it truncates - // the value in a GPR and for logic operations, it will recursively produce - // a GPR sequence for the operation. - auto getLogicOperand = [&] (SDValue Operand) -> SDValue { - unsigned OperandOpcode = Operand.getOpcode(); - if (OperandOpcode == ISD::SETCC) - return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig); - else if (OperandOpcode == ISD::TRUNCATE) { - SDValue InputOp = Operand.getOperand(0); - EVT InVT = InputOp.getValueType(); - return - SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 : - PPC::RLDICL, dl, InVT, InputOp, - getI64Imm(0, dl), getI64Imm(63, dl)), 0); - } else if (isLogicOp(OperandOpcode)) - return computeLogicOpInGPR(Operand); + // Don't convert this comparison to a GPR sequence because there are uses + // of the i1 result (i.e. uses that require the result in the CR). + if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG)) return SDValue(); - }; - LHS = getLogicOperand(LogicOp.getOperand(0)); - RHS = getLogicOperand(LogicOp.getOperand(1)); - // If a GPR sequence can't be produced for the LHS we can't proceed. - // Not producing a GPR sequence for the RHS is only a problem if this isn't - // a bitwise negation operation. - if (!LHS || (!RHS && !IsBitwiseNegation)) + SDValue LHS = Compare.getOperand(0); + SDValue RHS = Compare.getOperand(1); + + // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC. + int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2; + ISD::CondCode CC = + cast(Compare.getOperand(CCOpNum))->get(); + EVT InputVT = LHS.getValueType(); + if (InputVT != MVT::i32 && InputVT != MVT::i64) return SDValue(); - NumLogicOpsOnComparison++; + if (ConvOpts == SetccInGPROpts::ZExtInvert || + ConvOpts == SetccInGPROpts::SExtInvert) + CC = ISD::getSetCCInverse(CC, true); - // We will use the inputs as 64-bit values. - if (LHS.getValueType() == MVT::i32) - LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext); - if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32) - RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext); + bool Inputs32Bit = InputVT == MVT::i32; - unsigned NewOpc; - switch (LogicOp.getOpcode()) { - default: llvm_unreachable("Unknown logic operation."); - case ISD::AND: NewOpc = PPC::AND8; break; - case ISD::OR: NewOpc = PPC::OR8; break; - case ISD::XOR: NewOpc = PPC::XOR8; break; - } + SDLoc dl(Compare); + ConstantSDNode *RHSConst = dyn_cast(RHS); + int64_t RHSValue = RHSConst ? 
RHSConst->getSExtValue() : INT64_MAX; + bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig || + ConvOpts == SetccInGPROpts::SExtInvert; - if (IsBitwiseNegation) { - RHS = getI64Imm(1, dl); - NewOpc = PPC::XORI8; - } + if (IsSext && Inputs32Bit) + return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (Inputs32Bit) + return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (IsSext) + return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl); + return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); +} - return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0); +} // end anonymous namespace -} +bool PPCDAGToDAGISel::tryIntCompareInGPR(SDNode *N) { + if (N->getValueType(0) != MVT::i32 && + N->getValueType(0) != MVT::i64) + return false; -/// Try performing logical operations on results of comparisons in GPRs. -/// It is typically preferred from a performance perspective over performing -/// the operations on individual bits in the CR. We only do this on 64-bit -/// targets for now as the code is specialized for 64-bit (it uses 64-bit -/// instructions and assumes 64-bit registers). -bool PPCDAGToDAGISel::tryLogicOpOfCompares(SDNode *N) { + // This optimization will emit code that assumes 64-bit registers + // so we don't want to run it in 32-bit mode. Also don't run it + // on functions that are not to be optimized. if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) return false; - if (N->getValueType(0) != MVT::i1) - return false; - assert(isLogicOp(N->getOpcode()) && - "Expected a logic operation on setcc results."); - SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0)); - if (!LoweredLogical) - return false; - - SDLoc dl(N); - bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8; - unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt; - SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); - SDValue LHS = LoweredLogical.getOperand(0); - SDValue RHS = LoweredLogical.getOperand(1); - SDValue WideOp; - SDValue OpToConvToRecForm; - - // Look through any 32-bit to 64-bit implicit extend nodes to find the opcode - // that is input to the XORI. - if (IsBitwiseNegate && - LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG) - OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1); - else if (IsBitwiseNegate) - // If the input to the XORI isn't an extension, that's what we're after. - OpToConvToRecForm = LoweredLogical.getOperand(0); - else - // If this is not an XORI, it is a reg-reg logical op and we can convert it - // to record-form. - OpToConvToRecForm = LoweredLogical; - - // Get the record-form version of the node we're looking to use to get the - // CR result from. - uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode(); - int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc); - - // Convert the right node to record-form. This is either the logical we're - // looking at or it is the input node to the negation (if we're looking at - // a bitwise negation). - if (NewOpc != -1 && IsBitwiseNegate) { - // The input to the XORI has a record-form. Use it. - assert(LoweredLogical.getConstantOperandVal(1) == 1 && - "Expected a PPC::XORI8 only for bitwise negation."); - // Emit the record-form instruction. 
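The zero-extended forms dispatched just above rest on equally mechanical identities: cntlzd yields 64 only for a zero input (so a logical shift right by 6 produces the seteq bit), and a signed compare against zero is just the sign bit. A minimal C++20 sketch, with std::countl_zero standing in for cntlzd and an arbitrary value table as an assumption:

  #include <bit>
  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t vals[] = {-7, -1, 0, 1, 42, INT64_MIN, INT64_MAX};
    for (int64_t a : vals)
      for (int64_t b : vals) {
        uint64_t x = (uint64_t)a ^ (uint64_t)b;
        // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6):
        // only x == 0 gives a count of 64, and 64 >> 6 == 1.
        assert((std::countl_zero(x) >> 6) == (a == b ? 1 : 0));
        // (zext (setcc %a, 0, setlt)) -> (lshr %a, 63): the sign bit.
        assert(((uint64_t)a >> 63) == (uint64_t)(a < 0));
      }
  }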
- std::vector Ops; - for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++) - Ops.push_back(OpToConvToRecForm.getOperand(i)); - WideOp = - SDValue(CurDAG->getMachineNode(NewOpc, dl, - OpToConvToRecForm.getValueType(), - MVT::Glue, Ops), 0); - } else { - assert((NewOpc != -1 || !IsBitwiseNegate) && - "No record form available for AND8/OR8/XOR8?"); - WideOp = - SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl, - MVT::i64, MVT::Glue, LHS, RHS), 0); + switch (N->getOpcode()) { + default: break; + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + IntegerCompareEliminator ICmpElim(CurDAG, this); + if (SDNode *New = ICmpElim.Select(N)) { + ReplaceNode(N, New); + return true; + } } - - // Select this node to a single bit from CR0 set by the record-form node - // just created. For bitwise negation, use the EQ bit which is the equivalent - // of negating the result (i.e. it is a bit set when the result of the - // operation is zero). - SDValue SRIdxVal = - CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32); - SDValue CRBit = - SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, - MVT::i1, CR0Reg, SRIdxVal, - WideOp.getValue(1)), 0); - ReplaceNode(N, CRBit.getNode()); - return true; + } + return false; } -/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it. -/// Useful when emitting comparison code for 32-bit values without using -/// the compare instruction (which only considers the lower 32-bits). -SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) { - assert(Input.getValueType() == MVT::i32 && - "Can only sign-extend 32-bit values here."); - unsigned Opc = Input.getOpcode(); - - // The value was sign extended and then truncated to 32-bits. No need to - // sign extend it again. - if (Opc == ISD::TRUNCATE && - (Input.getOperand(0).getOpcode() == ISD::AssertSext || - Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND)) - return Input; +bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) { + if (N->getValueType(0) != MVT::i32 && + N->getValueType(0) != MVT::i64) + return false; - LoadSDNode *InputLoad = dyn_cast(Input); - // The input is a sign-extending load. No reason to sign-extend. - if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD) - return Input; + if (!UseBitPermRewriter) + return false; - ConstantSDNode *InputConst = dyn_cast(Input); - // We don't sign-extend constants and already sign-extended values. - if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG || - Opc == ISD::SIGN_EXTEND) - return Input; + switch (N->getOpcode()) { + default: break; + case ISD::ROTL: + case ISD::SHL: + case ISD::SRL: + case ISD::AND: + case ISD::OR: { + BitPermutationSelector BPS(CurDAG); + if (SDNode *New = BPS.Select(N)) { + ReplaceNode(N, New); + return true; + } + return false; + } + } - SDLoc dl(Input); - SignExtensionsAdded++; - return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0); + return false; } -/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it. -/// Useful when emitting comparison code for 32-bit values without using -/// the compare instruction (which only considers the lower 32-bits). 
-SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) { - assert(Input.getValueType() == MVT::i32 && - "Can only zero-extend 32-bit values here."); - LoadSDNode *InputLoad = dyn_cast(Input); - unsigned Opc = Input.getOpcode(); - - // No need to zero-extend loaded values (unless they're loaded with - // a sign-extending load). - if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD) - return Input; - - ConstantSDNode *InputConst = dyn_cast(Input); - bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0; - // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG so we have - // to conservatively actually clear the high bits. We also don't need to - // zero-extend constants or values that are already zero-extended. - if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND) - return Input; +/// SelectCC - Select a comparison of the specified values with the specified +/// condition code, returning the CR# of the expression. +SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, + const SDLoc &dl) { + // Always select the LHS. + unsigned Opc; - SDLoc dl(Input); - ZeroExtensionsAdded++; - return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input, - getI64Imm(0, dl), getI64Imm(32, dl)), - 0); -} + if (LHS.getValueType() == MVT::i32) { + unsigned Imm; + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + if (isInt32Immediate(RHS, Imm)) { + // SETEQ/SETNE comparison with 16-bit immediate, fold it. + if (isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); + // If this is a 16-bit signed immediate, fold it. + if (isInt<16>((int)Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); -// Handle a 32-bit value in a 64-bit register and vice-versa. These are of -// course not actual zero/sign extensions that will generate machine code, -// they're just a way to reinterpret a 32 bit value in a register as a -// 64 bit value and vice-versa. -SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes, - ExtOrTruncConversion Conv) { - SDLoc dl(NatWidthRes); + // For non-equality comparisons, the default code would materialize the + // constant, then compare against it, like this: + // lis r2, 4660 + // ori r2, r2, 22136 + // cmpw cr0, r3, r2 + // Since we are just comparing for equality, we can emit this instead: + // xoris r0,r3,0x1234 + // cmplwi cr0,r0,0x5678 + // beq cr0,L6 + SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS, + getI32Imm(Imm >> 16, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor, + getI32Imm(Imm & 0xFFFF, dl)), 0); + } + Opc = PPC::CMPLW; + } else if (ISD::isUnsignedIntSetCC(CC)) { + if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF, dl)), 0); + Opc = PPC::CMPLW; + } else { + int16_t SImm; + if (isIntS16Immediate(RHS, SImm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, + getI32Imm((int)SImm & 0xFFFF, + dl)), + 0); + Opc = PPC::CMPW; + } + } else if (LHS.getValueType() == MVT::i64) { + uint64_t Imm; + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + if (isInt64Immediate(RHS.getNode(), Imm)) { + // SETEQ/SETNE comparison with 16-bit immediate, fold it. 
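The xoris/cmplwi rewrite in SelectCC works because XOR with the high half is invertible: a 32-bit value equals the constant exactly when flipping the constant's high 16 bits leaves exactly the low 16 bits behind. A quick standalone C++ check; the immediates and test values are arbitrary assumptions:

  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  int main() {
    for (uint32_t imm : {0x12345678u, 0xFFFF0000u, 0x00001234u}) {
      uint32_t hi = imm >> 16, lo = imm & 0xFFFF;
      for (uint32_t r : {imm, imm ^ 1u, imm ^ 0x10000u, 0u, 0xFFFFFFFFu}) {
        // xoris r0,r3,hi ; cmplwi cr0,r0,lo  versus a full 32-bit compare.
        assert(((r ^ (hi << 16)) == lo) == (r == imm));
      }
    }
  }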
+ if (isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); + // If this is a 16-bit signed immediate, fold it. + if (isInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); - // For reinterpreting 32-bit values as 64 bit values, we generate - // INSERT_SUBREG IMPLICIT_DEF:i64, , TargetConstant:i32<1> - if (Conv == ExtOrTruncConversion::Ext) { - SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0); - SDValue SubRegIdx = - CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); - return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64, - ImDef, NatWidthRes, SubRegIdx), 0); + // For non-equality comparisons, the default code would materialize the + // constant, then compare against it, like this: + // lis r2, 4660 + // ori r2, r2, 22136 + // cmpd cr0, r3, r2 + // Since we are just comparing for equality, we can emit this instead: + // xoris r0,r3,0x1234 + // cmpldi cr0,r0,0x5678 + // beq cr0,L6 + if (isUInt<32>(Imm)) { + SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS, + getI64Imm(Imm >> 16, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor, + getI64Imm(Imm & 0xFFFF, dl)), + 0); + } + } + Opc = PPC::CMPLD; + } else if (ISD::isUnsignedIntSetCC(CC)) { + if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, + getI64Imm(Imm & 0xFFFF, dl)), 0); + Opc = PPC::CMPLD; + } else { + int16_t SImm; + if (isIntS16Immediate(RHS, SImm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, + getI64Imm(SImm & 0xFFFF, dl)), + 0); + Opc = PPC::CMPD; + } + } else if (LHS.getValueType() == MVT::f32) { + Opc = PPC::FCMPUS; + } else { + assert(LHS.getValueType() == MVT::f64 && "Unknown vt!"); + Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD; } - - assert(Conv == ExtOrTruncConversion::Trunc && - "Unknown convertion between 32 and 64 bit values."); - // For reinterpreting 64-bit values as 32-bit values, we just need to - // EXTRACT_SUBREG (i.e. extract the low word). - SDValue SubRegIdx = - CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); - return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32, - NatWidthRes, SubRegIdx), 0); + return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); } -/// Produces a zero-extended result of comparing two 32-bit values according to -/// the passed condition code. -SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS, - ISD::CondCode CC, - int64_t RHSValue, SDLoc dl) { - bool IsRHSZero = RHSValue == 0; +static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) { switch (CC) { - default: return SDValue(); - case ISD::SETEQ: { - // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5) - // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5) - SDValue Xor = IsRHSZero ? 
LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); - SDValue Clz = - SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); - SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl), - getI32Imm(31, dl) }; - return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, - ShiftOps), 0); - } - case ISD::SETNE: { - // (zext (setcc %a, %b, setne)) -> (xor (lshr (cntlzw (xor %a, %b)), 5), 1) - // (zext (setcc %a, 0, setne)) -> (xor (lshr (cntlzw %a), 5), 1) - SDValue Xor = IsRHSZero ? LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); - SDValue Clz = - SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); - SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl), - getI32Imm(31, dl) }; - SDValue Shift = - SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0); - return SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift, - getI32Imm(1, dl)), 0); - } + case ISD::SETUEQ: + case ISD::SETONE: + case ISD::SETOLE: + case ISD::SETOGE: + llvm_unreachable("Should be lowered by legalize!"); + default: llvm_unreachable("Unknown condition!"); + case ISD::SETOEQ: + case ISD::SETEQ: return PPC::PRED_EQ; + case ISD::SETUNE: + case ISD::SETNE: return PPC::PRED_NE; + case ISD::SETOLT: + case ISD::SETLT: return PPC::PRED_LT; + case ISD::SETULE: + case ISD::SETLE: return PPC::PRED_LE; + case ISD::SETOGT: + case ISD::SETGT: return PPC::PRED_GT; + case ISD::SETUGE: + case ISD::SETGE: return PPC::PRED_GE; + case ISD::SETO: return PPC::PRED_NU; + case ISD::SETUO: return PPC::PRED_UN; + // These two are invalid for floating point. Assume we have int. + case ISD::SETULT: return PPC::PRED_LT; + case ISD::SETUGT: return PPC::PRED_GT; } } -/// Produces a sign-extended result of comparing two 32-bit values according to -/// the passed condition code. -SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS, - ISD::CondCode CC, - int64_t RHSValue, SDLoc dl) { - bool IsRHSZero = RHSValue == 0; +/// getCRIdxForSetCC - Return the index of the condition register field +/// associated with the SetCC condition, and whether or not the field is +/// treated as inverted. That is, lt = 0; ge = 0 inverted. +static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) { + Invert = false; switch (CC) { - default: return SDValue(); - case ISD::SETEQ: { - // (sext (setcc %a, %b, seteq)) -> - // (ashr (shl (ctlz (xor %a, %b)), 58), 63) - // (sext (setcc %a, 0, seteq)) -> - // (ashr (shl (ctlz %a), 58), 63) - SDValue CountInput = IsRHSZero ? LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); - SDValue Cntlzw = - SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0); - SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) }; - SDValue Sldi = - SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0); - return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi, - getI32Imm(63, dl)), 0); - } - case ISD::SETNE: { - // Bitwise xor the operands, count leading zeros, shift right by 5 bits and - // flip the bit, finally take 2's complement. - // (sext (setcc %a, %b, setne)) -> - // (neg (xor (lshr (ctlz (xor %a, %b)), 5), 1)) - // Same as above, but the first xor is not needed. - // (sext (setcc %a, 0, setne)) -> - // (neg (xor (lshr (ctlz %a), 5), 1)) - SDValue Xor = IsRHSZero ? 
LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); - SDValue Clz = - SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); - SDValue ShiftOps[] = - { Clz, getI32Imm(27, dl), getI32Imm(5, dl), getI32Imm(31, dl) }; - SDValue Shift = - SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0); - SDValue Xori = - SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift, - getI32Imm(1, dl)), 0); - return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Xori), 0); + default: llvm_unreachable("Unknown condition!"); + case ISD::SETOLT: + case ISD::SETLT: return 0; // Bit #0 = SETOLT + case ISD::SETOGT: + case ISD::SETGT: return 1; // Bit #1 = SETOGT + case ISD::SETOEQ: + case ISD::SETEQ: return 2; // Bit #2 = SETOEQ + case ISD::SETUO: return 3; // Bit #3 = SETUO + case ISD::SETUGE: + case ISD::SETGE: Invert = true; return 0; // !Bit #0 = SETUGE + case ISD::SETULE: + case ISD::SETLE: Invert = true; return 1; // !Bit #1 = SETULE + case ISD::SETUNE: + case ISD::SETNE: Invert = true; return 2; // !Bit #2 = SETUNE + case ISD::SETO: Invert = true; return 3; // !Bit #3 = SETO + case ISD::SETUEQ: + case ISD::SETOGE: + case ISD::SETOLE: + case ISD::SETONE: + llvm_unreachable("Invalid branch code: should be expanded by legalize"); + // These are invalid for floating point. Assume integer. + case ISD::SETULT: return 0; + case ISD::SETUGT: return 1; } +} + +// getVCmpInst: return the vector compare instruction for the specified +// vector type and condition code. Since this is for altivec specific code, +// only support the altivec types (v16i8, v8i16, v4i32, v2i64, and v4f32). +static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, + bool HasVSX, bool &Swap, bool &Negate) { + Swap = false; + Negate = false; + + if (VecVT.isFloatingPoint()) { + /* Handle some cases by swapping input operands. */ + switch (CC) { + case ISD::SETLE: CC = ISD::SETGE; Swap = true; break; + case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; + case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break; + case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break; + case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; + case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break; + default: break; + } + /* Handle some cases by negating the result. */ + switch (CC) { + case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; + case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break; + case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break; + case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break; + default: break; + } + /* We have instructions implementing the remaining cases. */ + switch (CC) { + case ISD::SETEQ: + case ISD::SETOEQ: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPEQDP; + break; + case ISD::SETGT: + case ISD::SETOGT: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPGTDP; + break; + case ISD::SETGE: + case ISD::SETOGE: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPGEDP; + break; + default: + break; + } + llvm_unreachable("Invalid floating-point vector compare condition"); + } else { + /* Handle some cases by swapping input operands. 
*/ + switch (CC) { + case ISD::SETGE: CC = ISD::SETLE; Swap = true; break; + case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; + case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; + case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break; + default: break; + } + /* Handle some cases by negating the result. */ + switch (CC) { + case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; + case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break; + case ISD::SETLE: CC = ISD::SETGT; Negate = true; break; + case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break; + default: break; + } + /* We have instructions implementing the remaining cases. */ + switch (CC) { + case ISD::SETEQ: + case ISD::SETUEQ: + if (VecVT == MVT::v16i8) + return PPC::VCMPEQUB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPEQUH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPEQUW; + else if (VecVT == MVT::v2i64) + return PPC::VCMPEQUD; + break; + case ISD::SETGT: + if (VecVT == MVT::v16i8) + return PPC::VCMPGTSB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPGTSH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPGTSW; + else if (VecVT == MVT::v2i64) + return PPC::VCMPGTSD; + break; + case ISD::SETUGT: + if (VecVT == MVT::v16i8) + return PPC::VCMPGTUB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPGTUH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPGTUW; + else if (VecVT == MVT::v2i64) + return PPC::VCMPGTUD; + break; + default: + break; + } + llvm_unreachable("Invalid integer vector compare condition"); } } -/// Produces a zero-extended result of comparing two 64-bit values according to -/// the passed condition code. -SDValue PPCDAGToDAGISel::get64BitZExtCompare(SDValue LHS, SDValue RHS, - ISD::CondCode CC, - int64_t RHSValue, SDLoc dl) { - bool IsRHSZero = RHSValue == 0; - switch (CC) { - default: return SDValue(); - case ISD::SETEQ: { - // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6) - // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6) - SDValue Xor = IsRHSZero ? LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); - SDValue Clz = - SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0); - return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz, - getI64Imm(58, dl), getI64Imm(63, dl)), - 0); - } - } -} +bool PPCDAGToDAGISel::trySETCC(SDNode *N) { + SDLoc dl(N); + unsigned Imm; + ISD::CondCode CC = cast(N->getOperand(2))->get(); + EVT PtrVT = + CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); + bool isPPC64 = (PtrVT == MVT::i64); -/// Produces a sign-extended result of comparing two 64-bit values according to -/// the passed condition code. -SDValue PPCDAGToDAGISel::get64BitSExtCompare(SDValue LHS, SDValue RHS, - ISD::CondCode CC, - int64_t RHSValue, SDLoc dl) { - bool IsRHSZero = RHSValue == 0; - switch (CC) { - default: return SDValue(); - case ISD::SETEQ: { - // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1) - // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA) - // {addcz.reg, addcz.CA} = (addcarry %a, -1) - // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA) - SDValue AddInput = IsRHSZero ? 
LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); - SDValue Addic = - SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue, - AddInput, getI32Imm(~0U, dl)), 0); - return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic, - Addic, Addic.getValue(1)), 0); - } + if (!PPCSubTarget->useCRBits() && + isInt32Immediate(N->getOperand(1), Imm)) { + // We can codegen setcc op, imm very efficiently compared to a brcond. + // Check for those cases here. + // setcc op, 0 + if (Imm == 0) { + SDValue Op = N->getOperand(0); + switch (CC) { + default: break; + case ISD::SETEQ: { + Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0); + SDValue Ops[] = { Op, getI32Imm(27, dl), getI32Imm(5, dl), + getI32Imm(31, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } + case ISD::SETNE: { + if (isPPC64) break; + SDValue AD = + SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(~0U, dl)), 0); + CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, AD.getValue(1)); + return true; + } + case ISD::SETLT: { + SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } + case ISD::SETGT: { + SDValue T = + SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0); + T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0); + SDValue Ops[] = { T, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } + } + } else if (Imm == ~0U) { // setcc op, -1 + SDValue Op = N->getOperand(0); + switch (CC) { + default: break; + case ISD::SETEQ: + if (isPPC64) break; + Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(1, dl)), 0); + CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, + SDValue(CurDAG->getMachineNode(PPC::LI, dl, + MVT::i32, + getI32Imm(0, dl)), + 0), Op.getValue(1)); + return true; + case ISD::SETNE: { + if (isPPC64) break; + Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0); + SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(~0U, dl)); + CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), Op, + SDValue(AD, 1)); + return true; + } + case ISD::SETLT: { + SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op, + getI32Imm(1, dl)), 0); + SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD, + Op), 0); + SDValue Ops[] = { AN, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } + case ISD::SETGT: { + SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); + CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, getI32Imm(1, dl)); + return true; + } + } + } } -} -/// Does this SDValue have any uses for which keeping the value in a GPR is -/// appropriate. This is meant to be used on values that have type i1 since -/// it is somewhat meaningless to ask if values of other types can be kept in -/// GPR's. 
-static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) { - assert(Compare.getOpcode() == ISD::SETCC && - "An ISD::SETCC node required here."); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); - // For values that have a single use, the caller should obviously already have - // checked if that use is an extending use. We check the other uses here. - if (Compare.hasOneUse()) - return true; - // We want the value in a GPR if it is being extended, used for a select, or - // used in logical operations. - for (auto CompareUse : Compare.getNode()->uses()) - if (CompareUse->getOpcode() != ISD::SIGN_EXTEND && - CompareUse->getOpcode() != ISD::ZERO_EXTEND && - CompareUse->getOpcode() != ISD::SELECT && - !isLogicOp(CompareUse->getOpcode())) { - OmittedForNonExtendUses++; + // Altivec Vector compare instructions do not set any CR register by default and + // vector compare operations return the same type as the operands. + if (LHS.getValueType().isVector()) { + if (PPCSubTarget->hasQPX()) return false; + + EVT VecVT = LHS.getValueType(); + bool Swap, Negate; + unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC, + PPCSubTarget->hasVSX(), Swap, Negate); + if (Swap) + std::swap(LHS, RHS); + + EVT ResVT = VecVT.changeVectorElementTypeToInteger(); + if (Negate) { + SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0); + CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR, + ResVT, VCmp, VCmp); + return true; } - return true; -} -/// Returns an equivalent of a SETCC node but with the result the same width as -/// the inputs. This can nalso be used for SELECT_CC if either the true or false -/// values is a power of two while the other is zero. -SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, - SetccInGPROpts ConvOpts) { - assert((Compare.getOpcode() == ISD::SETCC || - Compare.getOpcode() == ISD::SELECT_CC) && - "An ISD::SETCC node required here."); + CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS); + return true; + } - // Don't convert this comparison to a GPR sequence because there are uses - // of the i1 result (i.e. uses that require the result in the CR). - if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG)) - return SDValue(); + if (PPCSubTarget->useCRBits()) + return false; - SDValue LHS = Compare.getOperand(0); - SDValue RHS = Compare.getOperand(1); + bool Inv; + unsigned Idx = getCRIdxForSetCC(CC, Inv); + SDValue CCReg = SelectCC(LHS, RHS, CC, dl); + SDValue IntCR; - // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC. - int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2; - ISD::CondCode CC = - cast(Compare.getOperand(CCOpNum))->get(); - EVT InputVT = LHS.getValueType(); - if (InputVT != MVT::i32 && InputVT != MVT::i64) - return SDValue(); + // Force the ccreg into CR7. + SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32); - if (ConvOpts == SetccInGPROpts::ZExtInvert || - ConvOpts == SetccInGPROpts::SExtInvert) - CC = ISD::getSetCCInverse(CC, true); + SDValue InFlag(nullptr, 0); // Null incoming flag value. 
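For the non-CR-bit path that follows, mfocrf with CR7 leaves the four field bits (LT, GT, EQ, SO) in the low nibble of the GPR, and rlwinm with MB = ME = 31 keeps only the least significant bit after rotating left by (32 - (3 - Idx)) & 31. A sketch of that arithmetic; rlwinm_bit is a hypothetical helper modelling just this rotate-and-mask form:

  #include <cassert>
  #include <cstdint>

  // Models rlwinm RT,RS,SH,31,31: rotate left by SH, then keep only the
  // least significant bit (big-endian bit 31).
  static uint32_t rlwinm_bit(uint32_t rs, unsigned sh) {
    uint32_t rot = (rs << sh) | (rs >> ((32 - sh) & 31));
    return rot & 1;
  }

  int main() {
    for (uint32_t field = 0; field < 16; ++field)   // any CR7 contents
      for (unsigned idx = 0; idx < 4; ++idx) {      // lt, gt, eq, so
        unsigned sh = (32 - (3 - idx)) & 31;
        // Rotating left by 32-(3-idx) is rotating right by 3-idx, which
        // brings big-endian CR bit idx down to the least significant bit.
        assert(rlwinm_bit(field, sh) == ((field >> (3 - idx)) & 1));
      }
  }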
+ CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, + InFlag).getValue(1); - bool Inputs32Bit = InputVT == MVT::i32; - if (ISD::isSignedIntSetCC(CC) && Inputs32Bit) { - LHS = signExtendInputIfNeeded(LHS); - RHS = signExtendInputIfNeeded(RHS); - } else if (ISD::isUnsignedIntSetCC(CC) && Inputs32Bit) { - LHS = zeroExtendInputIfNeeded(LHS); - RHS = zeroExtendInputIfNeeded(RHS); - } + IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg, + CCReg), 0); - SDLoc dl(Compare); - ConstantSDNode *RHSConst = dyn_cast(RHS); - int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX; - bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig || - ConvOpts == SetccInGPROpts::SExtInvert; + SDValue Ops[] = { IntCR, getI32Imm((32 - (3 - Idx)) & 31, dl), + getI32Imm(31, dl), getI32Imm(31, dl) }; + if (!Inv) { + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } - if (IsSext && Inputs32Bit) - return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl); - else if (Inputs32Bit) - return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); - else if (IsSext) - return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl); - return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + // Get the specified bit. + SDValue Tmp = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); + CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl)); + return true; } /// Does this node represent a load/store node whose address can be represented @@ -3016,8 +3831,18 @@ bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { AddrOp = STN->getOperand(2); short Imm = 0; - if (AddrOp.getOpcode() == ISD::ADD) + if (AddrOp.getOpcode() == ISD::ADD) { + // If op0 is a frame index that is under aligned, we can't do it either, + // because it is translated to r31 or r1 + slot + offset. We won't know the + // slot number until the stack frame is finalized. + if (FrameIndexSDNode *FI = dyn_cast(AddrOp.getOperand(0))) { + const MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo(); + unsigned SlotAlign = MFI.getObjectAlignment(FI->getIndex()); + if ((SlotAlign % Val) != 0) + return false; + } return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val); + } // If the address comes from the outside, the offset will be zero. return AddrOp.getOpcode() == ISD::CopyFromReg; @@ -3050,22 +3875,20 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (tryBitPermutation(N)) return; + // Try to emit integer compares as GPR-only sequences (i.e. no use of CR). + if (tryIntCompareInGPR(N)) + return; + switch (N->getOpcode()) { default: break; case ISD::Constant: if (N->getValueType(0) == MVT::i64) { - ReplaceNode(N, getInt64(CurDAG, N)); + ReplaceNode(N, selectI64Imm(CurDAG, N)); return; } break; - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: - if (tryEXTEND(N)) - return; - break; - case ISD::SETCC: if (trySETCC(N)) return; @@ -3209,9 +4032,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } case ISD::AND: { - if (tryLogicOpOfCompares(N)) - return; - unsigned Imm, Imm2, SH, MB, ME; uint64_t Imm64; @@ -3331,9 +4151,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (tryBitfieldInsert(N)) return; - if (tryLogicOpOfCompares(N)) - return; - int16_t Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && isIntS16Immediate(N->getOperand(1), Imm)) { @@ -3348,12 +4165,48 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } } + // OR with a 32-bit immediate can be handled by ori + oris + // without creating an immediate in a GPR. 
+ uint64_t Imm64 = 0; + bool IsPPC64 = PPCSubTarget->isPPC64(); + if (IsPPC64 && isInt64Immediate(N->getOperand(1), Imm64) && + (Imm64 & ~0xFFFFFFFFuLL) == 0) { + // If ImmHi (ImmLo) is zero, only one ori (oris) is generated later. + uint64_t ImmHi = Imm64 >> 16; + uint64_t ImmLo = Imm64 & 0xFFFF; + if (ImmHi != 0 && ImmLo != 0) { + SDNode *Lo = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, + N->getOperand(0), + getI16Imm(ImmLo, dl)); + SDValue Ops1[] = { SDValue(Lo, 0), getI16Imm(ImmHi, dl)}; + CurDAG->SelectNodeTo(N, PPC::ORIS8, MVT::i64, Ops1); + return; + } + } + + // Other cases are autogenerated. break; } case ISD::XOR: { - if (tryLogicOpOfCompares(N)) - return; + // XOR with a 32-bit immediate can be handled by xori + xoris + // without creating an immediate in a GPR. + uint64_t Imm64 = 0; + bool IsPPC64 = PPCSubTarget->isPPC64(); + if (IsPPC64 && isInt64Immediate(N->getOperand(1), Imm64) && + (Imm64 & ~0xFFFFFFFFuLL) == 0) { + // If ImmHi (ImmLo) is zero, only one xori (xoris) is generated later. + uint64_t ImmHi = Imm64 >> 16; + uint64_t ImmLo = Imm64 & 0xFFFF; + if (ImmHi != 0 && ImmLo != 0) { + SDNode *Lo = CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, + N->getOperand(0), + getI16Imm(ImmLo, dl)); + SDValue Ops1[] = { SDValue(Lo, 0), getI16Imm(ImmHi, dl)}; + CurDAG->SelectNodeTo(N, PPC::XORIS8, MVT::i64, Ops1); + return; + } + } + break; } case ISD::ADD: { @@ -3666,9 +4519,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // The first source operand is a TargetGlobalAddress or a TargetJumpTable. // If it must be toc-referenced according to PPCSubTarget, we generate: - // LDtocL(, ADDIStocHA(%X2, )) + // LDtocL(@sym, ADDIStocHA(%x2, @sym)) // Otherwise we generate: - // ADDItocL(ADDIStocHA(%X2, ), ) + // ADDItocL(ADDIStocHA(%x2, @sym), @sym) SDValue GA = N->getOperand(0); SDValue TOCbase = N->getOperand(1); SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64, diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index b3a3c73f6df03..18e567fa589c7 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -51,6 +51,9 @@ #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" @@ -82,11 +85,8 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" #include #include #include @@ -114,6 +114,8 @@ cl::desc("disable sibling call optimization on ppc"), cl::Hidden); STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); +static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); + // FIXME: Remove this once the bug has been fixed! 
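The ori+oris (and xori+xoris) split above works because each instruction takes a 16-bit immediate and OR/XOR both distribute over the two disjoint halves of a 32-bit mask, so no scratch register is needed. A standalone C++ check; the register and immediate tables are arbitrary assumptions:

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t regs[] = {0, 1, 0x123456789ABCDEF0ull, ~0ull};
    const uint64_t imms[] = {0x00010001, 0xDEADBEEF, 0xFFFFFFFF};
    for (uint64_t r : regs)
      for (uint64_t imm : imms) {
        assert((imm & ~0xFFFFFFFFull) == 0);   // the guard from the patch
        uint64_t lo = imm & 0xFFFF, hi = imm >> 16;
        // ori rT,rS,lo ; oris rT,rT,hi
        assert(((r | lo) | (hi << 16)) == (r | imm));
        // xori rT,rS,lo ; xoris rT,rT,hi
        assert(((r ^ lo) ^ (hi << 16)) == (r ^ imm));
      }
  }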
extern cl::opt ANDIGlueBug; @@ -226,6 +228,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::UREM, MVT::i64, Expand); } + if (Subtarget.hasP9Vector()) { + setOperationAction(ISD::ABS, MVT::v4i32, Legal); + setOperationAction(ISD::ABS, MVT::v8i16, Legal); + setOperationAction(ISD::ABS, MVT::v16i8, Legal); + } + // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); @@ -283,14 +291,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FROUND, MVT::f32, Legal); } - // PowerPC does not have BSWAP + // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd + // to speed up scalar BSWAP64. // CTPOP or CTTZ were introduced in P8/P9 respectivelly setOperationAction(ISD::BSWAP, MVT::i32 , Expand); - setOperationAction(ISD::BSWAP, MVT::i64 , Expand); if (Subtarget.isISA3_0()) { + setOperationAction(ISD::BSWAP, MVT::i64 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Legal); setOperationAction(ISD::CTTZ , MVT::i64 , Legal); } else { + setOperationAction(ISD::BSWAP, MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Expand); } @@ -773,6 +783,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SRL, MVT::v1i128, Legal); setOperationAction(ISD::SRA, MVT::v1i128, Expand); } + + if (Subtarget.hasP9Altivec()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + } } if (Subtarget.hasQPX()) { @@ -1131,7 +1146,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; - case PPCISD::XXINSERT: return "PPCISD::XXINSERT"; + case PPCISD::VECINSERT: return "PPCISD::VECINSERT"; case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE"; case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; @@ -2413,8 +2428,8 @@ static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit, SDValue Ops[] = { GA, Reg }; return DAG.getMemIntrinsicNode( PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, - false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, + MachineMemOperand::MOLoad); } SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, @@ -2470,7 +2485,6 @@ SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table, return TargetLowering::getPICJumpTableRelocBase(Table, DAG); switch (getTargetMachine().getCodeModel()) { - case CodeModel::Default: case CodeModel::Small: case CodeModel::Medium: return TargetLowering::getPICJumpTableRelocBase(Table, DAG); @@ -2488,7 +2502,6 @@ PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); switch (getTargetMachine().getCodeModel()) { - case CodeModel::Default: case CodeModel::Small: case CodeModel::Medium: return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); @@ -2560,7 +2573,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool is64bit = Subtarget.isPPC64(); - 
const Module *M = DAG.getMachineFunction().getFunction()->getParent(); + const Module *M = DAG.getMachineFunction().getFunction().getParent(); PICLevel::Level picLevel = M->getPICLevel(); TLSModel::Model Model = getTargetMachine().getTLSModel(GV); @@ -3529,7 +3542,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; unsigned &QFPR_idx = FPR_idx; SmallVector MemOps; - Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); + Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; @@ -3614,6 +3627,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store; @@ -3648,6 +3662,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( break; unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Addr = FIN; if (j) { @@ -3684,6 +3699,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) @@ -3729,6 +3745,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( // since otherwise we never run out of FPRs before running out // of GPRs. unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::f32) { @@ -3969,7 +3986,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_Darwin( SmallVector MemOps; unsigned nAltivecParamsAtEnd = 0; - Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); + Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; @@ -4251,13 +4268,25 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, static bool isFunctionGlobalAddress(SDValue Callee); static bool -resideInSameSection(const Function *Caller, SDValue Callee, +callsShareTOCBase(const Function *Caller, SDValue Callee, const TargetMachine &TM) { // If !G, Callee can be an external symbol. GlobalAddressSDNode *G = dyn_cast(Callee); if (!G) return false; + // The medium and large code models are expected to provide a sufficiently + // large TOC to satisfy all data addressing needs of a module with a + // single TOC. Since each module will be addressed with a single TOC, we + // only need to check that caller and callee don't cross dso boundaries. + if (CodeModel::Medium == TM.getCodeModel() || + CodeModel::Large == TM.getCodeModel()) + return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal()); + + // Otherwise we need to ensure callee and caller are in the same section, + // since the linker may allocate multiple TOCs, and we don't know which + // sections will belong to the same TOC base. 
+ const GlobalValue *GV = G->getGlobal(); if (!GV->isStrongDefinitionForLinker()) return false; @@ -4335,12 +4364,12 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget, } static bool -hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) { - if (CS->arg_size() != CallerFn->arg_size()) +hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) { + if (CS.arg_size() != CallerFn->arg_size()) return false; - ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin(); - ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end(); + ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin(); + ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end(); Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin(); for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) { @@ -4363,11 +4392,25 @@ hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) { return true; } +// Returns true if TCO is possible between the callers and callees +// calling conventions. +static bool +areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC, + CallingConv::ID CalleeCC) { + // Tail or Sibling call optimization (TCO/SCO) needs callee and caller to + // have the same calling convention. + if (CallerCC != CalleeCC) + return false; + + // Tail or Sibling calls can be done with fastcc/ccc. + return (CallerCC == CallingConv::Fast || CallerCC == CallingConv::C); +} + bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( SDValue Callee, CallingConv::ID CalleeCC, - ImmutableCallSite *CS, + ImmutableCallSite CS, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &Ins, @@ -4379,15 +4422,9 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( // Variadic argument functions are not supported. if (isVarArg) return false; - MachineFunction &MF = DAG.getMachineFunction(); - CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); - - // Tail or Sibling call optimization (TCO/SCO) needs callee and caller has - // the same calling convention - if (CallerCC != CalleeCC) return false; - - // SCO support C calling convention - if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C) + auto &Caller = DAG.getMachineFunction().getFunction(); + // Check that the calling conventions are compatible for tco. + if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC)) return false; // Caller contains any byval parameter is not supported. @@ -4406,11 +4443,10 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( !isa(Callee)) return false; - // Check if Callee resides in the same section, because for now, PPC64 SVR4 - // ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another - // section. + // If the caller and callee potentially have different TOC bases then we + // cannot tail call since we need to restore the TOC pointer after the call. // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977 - if (!resideInSameSection(MF.getFunction(), Callee, getTargetMachine())) + if (!callsShareTOCBase(&Caller, Callee, getTargetMachine())) return false; // TCO allows altering callee ABI, so we don't have to check further. @@ -4422,7 +4458,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( // If callee use the same argument list that caller is using, then we can // apply SCO on this case. If it is not, then we need to check if callee needs // stack for passing arguments. 
- if (!hasSameArgumentList(MF.getFunction(), CS) && + if (!hasSameArgumentList(&Caller, CS) && needStackSlotPassParameters(Subtarget, Outs)) { return false; } @@ -4447,7 +4483,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; MachineFunction &MF = DAG.getMachineFunction(); - CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); + CallingConv::ID CallerCC = MF.getFunction().getCallingConv(); if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { // Functions containing by val parameters are not supported. for (unsigned i = 0; i != Ins.size(); i++) { @@ -4676,7 +4712,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, bool isPatchPoint, bool hasNest, SmallVectorImpl> &RegsToPass, SmallVectorImpl &Ops, std::vector &NodeTys, - ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { + ImmutableCallSite CS, const PPCSubtarget &Subtarget) { bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); @@ -4699,7 +4735,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, // we're building with the leopard linker or later, which automatically // synthesizes these stubs. const TargetMachine &TM = DAG.getTarget(); - const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); + const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); const GlobalValue *GV = nullptr; if (auto *G = dyn_cast(Callee)) GV = G->getGlobal(); @@ -4787,7 +4823,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, MachineMemOperand::MOInvariant) : MachineMemOperand::MONone; - MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr); + MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr); SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, /* Alignment = */ 8, MMOFlags); @@ -4917,7 +4953,7 @@ SDValue PPCTargetLowering::FinishCall( SmallVector, 8> &RegsToPass, SDValue InFlag, SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, - SmallVectorImpl &InVals, ImmutableCallSite *CS) const { + SmallVectorImpl &InVals, ImmutableCallSite CS) const { std::vector NodeTys; SmallVector Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, @@ -4992,7 +5028,7 @@ SDValue PPCTargetLowering::FinishCall( // any other variadic arguments). Ops.insert(std::next(Ops.begin()), AddTOC); } else if (CallOpc == PPCISD::CALL && - !resideInSameSection(MF.getFunction(), Callee, DAG.getTarget())) { + !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) { // Otherwise insert NOP for non-local calls. 
CallOpc = PPCISD::CALL_NOP; } @@ -5025,10 +5061,10 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; bool isPatchPoint = CLI.IsPatchPoint; - ImmutableCallSite *CS = CLI.CS; + ImmutableCallSite CS = CLI.CS; if (isTailCall) { - if (Subtarget.useLongCalls() && !(CS && CS->isMustTailCall())) + if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall())) isTailCall = false; else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) isTailCall = @@ -5056,7 +5092,7 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } - if (!isTailCall && CS && CS->isMustTailCall()) + if (!isTailCall && CS && CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -5090,7 +5126,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const { + ImmutableCallSite CS) const { // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description // of the 32-bit SVR4 ABI stack frame layout. @@ -5324,7 +5360,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const { + ImmutableCallSite CS) const { bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); @@ -5974,7 +6010,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const { + ImmutableCallSite CS) const { unsigned NumOps = Outs.size(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -7448,9 +7484,11 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT, /// - The node is a "load-and-splat" /// In all other cases, we will choose to keep the BUILD_VECTOR. static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V, - bool HasDirectMove) { + bool HasDirectMove, + bool HasP8Vector) { EVT VecVT = V->getValueType(0); - bool RightType = VecVT == MVT::v2f64 || VecVT == MVT::v4f32 || + bool RightType = VecVT == MVT::v2f64 || + (HasP8Vector && VecVT == MVT::v4f32) || (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32)); if (!RightType) return false; @@ -7612,7 +7650,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // lowered to VSX instructions under certain conditions. // Without VSX, there is no pattern more efficient than expanding the node. if (Subtarget.hasVSX() && - haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove())) + haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(), + Subtarget.hasP8Vector())) return Op; return SDValue(); } @@ -7646,6 +7685,15 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, return DAG.getBitcast(Op.getValueType(), NewBV); return NewBV; } + + // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll + // detect that constant splats like v8i16: 0xABAB are really just splats + // of a 1-byte constant. In this case, we need to convert the node to a + // splat of v16i8 and a bitcast. 
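+  // (Concrete example of the case handled below: BUILD_VECTOR v8i16
+  // <0xABAB x 8> has the same byte pattern as BUILD_VECTOR v16i8 <0xAB x 16>,
+  // so with SplatBits == 0xAB the v16i8 splat plus a bitcast back to the
+  // original type is byte-for-byte equivalent.)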
+  if (Op.getValueType() != MVT::v16i8)
+    return DAG.getBitcast(Op.getValueType(),
+                          DAG.getConstant(SplatBits, dl, MVT::v16i8));
+  return Op;
 }
 
@@ -7855,6 +7903,219 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
   return DAG.getNode(ISD::BITCAST, dl, VT, T);
 }
 
+/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
+/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
+/// SDValue.
+SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
+                                           SelectionDAG &DAG) const {
+  const unsigned BytesInVector = 16;
+  bool IsLE = Subtarget.isLittleEndian();
+  SDLoc dl(N);
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  unsigned ShiftElts = 0, InsertAtByte = 0;
+  bool Swap = false;
+
+  // Shifts required to get the byte we want at element 7.
+  unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
+                                   0, 15, 14, 13, 12, 11, 10, 9};
+  unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
+                                1, 2, 3, 4, 5, 6, 7, 8};
+
+  ArrayRef<int> Mask = N->getMask();
+  int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+  // For each mask element, find out if we're just inserting something
+  // from V2 into V1 or vice versa.
+  // Possible permutations inserting an element from V2 into V1:
+  //   X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  //   0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  //   ...
+  //   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
+  // Inserting from V1 into V2 will be similar, except mask range will be
+  // [16,31].
+
+  bool FoundCandidate = false;
+  // If both vector operands for the shuffle are the same vector, the mask
+  // will contain only elements from the first one and the second one will be
+  // undef.
+  unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
+  // Go through the mask of bytes to find an element that's being moved
+  // from one vector to the other.
+  for (unsigned i = 0; i < BytesInVector; ++i) {
+    unsigned CurrentElement = Mask[i];
+    // If 2nd operand is undefined, we should only look for element 7 in the
+    // Mask.
+    if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
+      continue;
+
+    bool OtherElementsInOrder = true;
+    // Examine the other elements in the Mask to see if they're in original
+    // order.
+    for (unsigned j = 0; j < BytesInVector; ++j) {
+      if (j == i)
+        continue;
+      // If CurrentElement is from V1 [0,15], then we expect the rest of the
+      // Mask to be from V2 [16,31] and vice versa. Unless the 2nd operand is
+      // undefined, in which case we always assume we're picking from the
+      // 1st operand.
+      int MaskOffset =
+          (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
+      if (Mask[j] != OriginalOrder[j] + MaskOffset) {
+        OtherElementsInOrder = false;
+        break;
+      }
+    }
+    // If other elements are in original order, we record the number of shifts
+    // we need to get the element we want into element 7. Also record which
+    // byte in the vector we should insert into.
+    if (OtherElementsInOrder) {
+      // If 2nd operand is undefined, we assume no shifts and no swapping.
+      if (V2.isUndef()) {
+        ShiftElts = 0;
+        Swap = false;
+      } else {
+        // Only need the last 4 bits for shifts because operands will be
+        // swapped if CurrentElement is >= 2^4.
+        ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
                         : BigEndianShifts[CurrentElement & 0xF];
+        Swap = CurrentElement < BytesInVector;
+      }
+      InsertAtByte = IsLE ?
BytesInVector - (i + 1) : i; + FoundCandidate = true; + break; + } + } + + if (!FoundCandidate) + return SDValue(); + + // Candidate found, construct the proper SDAG sequence with VINSERTB, + // optionally with VECSHL if shift is required. + if (Swap) + std::swap(V1, V2); + if (V2.isUndef()) + V2 = V1; + if (ShiftElts) { + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + } + return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); +} + +/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled +/// by the VINSERTH instruction introduced in ISA 3.0, else just return default +/// SDValue. +SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N, + SelectionDAG &DAG) const { + const unsigned NumHalfWords = 8; + const unsigned BytesInVector = NumHalfWords * 2; + // Check that the shuffle is on half-words. + if (!isNByteElemShuffleMask(N, 2, 1)) + return SDValue(); + + bool IsLE = Subtarget.isLittleEndian(); + SDLoc dl(N); + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + unsigned ShiftElts = 0, InsertAtByte = 0; + bool Swap = false; + + // Shifts required to get the half-word we want at element 3. + unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5}; + unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4}; + + uint32_t Mask = 0; + uint32_t OriginalOrderLow = 0x1234567; + uint32_t OriginalOrderHigh = 0x89ABCDEF; + // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a + // 32-bit space, only need 4-bit nibbles per element. + for (unsigned i = 0; i < NumHalfWords; ++i) { + unsigned MaskShift = (NumHalfWords - 1 - i) * 4; + Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift); + } + + // For each mask element, find out if we're just inserting something + // from V2 into V1 or vice versa. Possible permutations inserting an element + // from V2 into V1: + // X, 1, 2, 3, 4, 5, 6, 7 + // 0, X, 2, 3, 4, 5, 6, 7 + // 0, 1, X, 3, 4, 5, 6, 7 + // 0, 1, 2, X, 4, 5, 6, 7 + // 0, 1, 2, 3, X, 5, 6, 7 + // 0, 1, 2, 3, 4, X, 6, 7 + // 0, 1, 2, 3, 4, 5, X, 7 + // 0, 1, 2, 3, 4, 5, 6, X + // Inserting from V1 into V2 will be similar, except mask range will be [8,15]. + + bool FoundCandidate = false; + // Go through the mask of half-words to find an element that's being moved + // from one vector to the other. + for (unsigned i = 0; i < NumHalfWords; ++i) { + unsigned MaskShift = (NumHalfWords - 1 - i) * 4; + uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF; + uint32_t MaskOtherElts = ~(0xF << MaskShift); + uint32_t TargetOrder = 0x0; + + // If both vector operands for the shuffle are the same vector, the mask + // will contain only elements from the first one and the second one will be + // undef. + if (V2.isUndef()) { + ShiftElts = 0; + unsigned VINSERTHSrcElem = IsLE ? 4 : 3; + TargetOrder = OriginalOrderLow; + Swap = false; + // Skip if not the correct element or mask of other elements don't equal + // to our expected order. + if (MaskOneElt == VINSERTHSrcElem && + (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { + InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; + FoundCandidate = true; + break; + } + } else { // If both operands are defined. + // Target order is [8,15] if the current mask is between [0,7]. + TargetOrder = + (MaskOneElt < NumHalfWords) ? 
OriginalOrderHigh : OriginalOrderLow; + // Skip if mask of other elements don't equal our expected order. + if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { + // We only need the last 3 bits for the number of shifts. + ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7] + : BigEndianShifts[MaskOneElt & 0x7]; + InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; + Swap = MaskOneElt < NumHalfWords; + FoundCandidate = true; + break; + } + } + } + + if (!FoundCandidate) + return SDValue(); + + // Candidate found, construct the proper SDAG sequence with VINSERTH, + // optionally with VECSHL if shift is required. + if (Swap) + std::swap(V1, V2); + if (V2.isUndef()) + V2 = V1; + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + if (ShiftElts) { + // Double ShiftElts because we're left shifting on v16i8 type. + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, + DAG.getConstant(2 * ShiftElts, dl, MVT::i32)); + SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl); + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); + } + SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); +} + /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this /// is a shuffle we can handle in a single instruction, return it. Otherwise, /// return the code it can be lowered into. Worst case, it can always be @@ -7869,7 +8130,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, bool isLittleEndian = Subtarget.isLittleEndian(); unsigned ShiftElts, InsertAtByte; - bool Swap; + bool Swap = false; if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { @@ -7880,15 +8141,23 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (ShiftElts) { SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2, DAG.getConstant(ShiftElts, dl, MVT::i32)); - SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl, + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } - SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2, + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } + if (Subtarget.hasP9Altivec()) { + SDValue NewISDNode; + if ((NewISDNode = lowerToVINSERTH(SVOp, DAG))) + return NewISDNode; + + if ((NewISDNode = lowerToVINSERTB(SVOp, DAG))) + return NewISDNode; + } if (Subtarget.hasVSX() && PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { @@ -8390,6 +8659,8 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. if (Subtarget.isPPC64()) @@ -8397,9 +8668,37 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getRegister(PPC::R2, MVT::i32); } + // We are looking for absolute values here. 
+ // The idea is to try to fit one of two patterns: + // max (a, (0-a)) OR max ((0-a), a) + if (Subtarget.hasP9Vector() && + (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw || + IntrinsicID == Intrinsic::ppc_altivec_vmaxsh || + IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) { + SDValue V1 = Op.getOperand(1); + SDValue V2 = Op.getOperand(2); + if (V1.getSimpleValueType() == V2.getSimpleValueType() && + (V1.getSimpleValueType() == MVT::v4i32 || + V1.getSimpleValueType() == MVT::v8i16 || + V1.getSimpleValueType() == MVT::v16i8)) { + if ( V1.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && + V1.getOperand(1) == V2 ) { + // Generate the abs instruction with the operands + return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2); + } + + if ( V2.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && + V2.getOperand(1) == V1 ) { + // Generate the abs instruction with the operands + return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1); + } + } + } + // If this is a lowered altivec predicate compare, CompareOpc is set to the // opcode number of the comparison. - SDLoc dl(Op); int CompareOpc; bool isDot; if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) @@ -8495,6 +8794,23 @@ SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const { return Op; } +// Lower scalar BSWAP64 to xxbrd. +SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + // MTVSRDD + Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0), + Op.getOperand(0)); + // XXBRD + Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op); + // MFVSRD + int VectorIndex = 0; + if (Subtarget.isLittleEndian()) + VectorIndex = 1; + Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op, + DAG.getTargetConstant(VectorIndex, dl, MVT::i32)); + return Op; +} + SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -8539,11 +8855,29 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Should only be called for ISD::INSERT_VECTOR_ELT"); + ConstantSDNode *C = dyn_cast(Op.getOperand(2)); // We have legal lowering for constant indices but not for variable ones. - if (C) - return Op; - return SDValue(); + if (!C) + return SDValue(); + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types. + if (VT == MVT::v8i16 || VT == MVT::v16i8) { + SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2); + unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8; + unsigned InsertAtElement = C->getZExtValue(); + unsigned InsertAtByte = InsertAtElement * BytesInEachElement; + if (Subtarget.isLittleEndian()) { + InsertAtByte = (16 - BytesInEachElement) - InsertAtByte; + } + return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + } + return Op; } SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, @@ -8966,6 +9300,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SREM: case ISD::UREM: return LowerREM(Op, DAG); + case ISD::BSWAP: + return LowerBSWAP(Op, DAG); } } @@ -9461,7 +9797,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Naked functions never have a base pointer, and so we use r1. 
For all
   // other functions, this decision must be delayed until during PEI.
   unsigned BaseReg;
-  if (MF->getFunction()->hasFnAttribute(Attribute::Naked))
+  if (MF->getFunction().hasFnAttribute(Attribute::Naked))
     BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
   else
     BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
@@ -11887,9 +12223,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                     cast<StoreSDNode>(N)->getMemOperand());
     }
 
+    // STORE Constant:i32<0> -> STORE Constant:i64<0>
+    // This can increase the chance of CSE for constant construction.
+    EVT VT = N->getOperand(1).getValueType();
+    if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
+        isa<ConstantSDNode>(N->getOperand(1)) && VT == MVT::i32) {
+      // Need to sign-extend to 64 bits to handle negative values.
+      EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
+      uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
+                                    MemVT.getSizeInBits());
+      SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);
+
+      // DAG.getTruncStore() can't be used here because it doesn't accept
+      // the general (base + offset) addressing mode.
+      // So we use UpdateNodeOperands and setTruncatingStore instead.
+      DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
+                             N->getOperand(3));
+      cast<StoreSDNode>(N)->setTruncatingStore(true);
+      return SDValue(N, 0);
+    }
+
     // For little endian, VSX stores require generating xxswapd/lxvd2x.
     // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
-    EVT VT = N->getOperand(1).getValueType();
     if (VT.isSimple()) {
       MVT StoreVT = VT.getSimpleVT();
       if (Subtarget.needsSwapsForVSXMemOps() &&
@@ -12690,6 +13045,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
         return std::make_pair(0U, &PPC::QSRCRegClass);
       if (Subtarget.hasAltivec())
         return std::make_pair(0U, &PPC::VRRCRegClass);
+      break;
     case 'y':   // crrc
       return std::make_pair(0U, &PPC::CRRCRegClass);
     }
@@ -12810,7 +13166,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 // by AM is legal for this target, for a load/store of the specified type.
 bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                               const AddrMode &AM, Type *Ty,
-                                              unsigned AS) const {
+                                              unsigned AS, Instruction *I) const {
   // PPC does not allow r+i addressing modes for vectors!
   if (Ty->isVectorTy() && AM.BaseOffs != 0)
     return false;
@@ -12895,7 +13251,7 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
   // Naked functions never have a frame pointer, and so we use r1. For all
   // other functions, this decision must be delayed until during PEI.
   unsigned FrameReg;
-  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+  if (MF.getFunction().hasFnAttribute(Attribute::Naked))
     FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
   else
     FrameReg = isPPC64 ?
PPC::FP8 : PPC::FP; @@ -12940,6 +13296,7 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfd: @@ -12992,9 +13349,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; + Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::ppc_qpx_qvlfda: @@ -13028,9 +13383,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; + Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::ppc_qpx_qvstfd: @@ -13082,9 +13435,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; - Info.vol = false; - Info.readMem = false; - Info.writeMem = true; + Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::ppc_qpx_qvstfda: @@ -13117,9 +13468,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; - Info.vol = false; - Info.readMem = false; - Info.writeMem = true; + Info.flags = MachineMemOperand::MOStore; return true; } default: @@ -13146,12 +13495,12 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, bool MemcpyStrSrc, MachineFunction &MF) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { - const Function *F = MF.getFunction(); + const Function &F = MF.getFunction(); // When expanding a memset, require at least two QPX instructions to cover // the cost of loading the value to be stored from the constant pool. if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) && (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) && - !F->hasFnAttribute(Attribute::NoImplicitFloat)) { + !F.hasFnAttribute(Attribute::NoImplicitFloat)) { return MVT::v4f64; } @@ -13216,8 +13565,9 @@ bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return TargetLowering::isZExtFree(Val, VT2); } -bool PPCTargetLowering::isFPExtFree(EVT VT) const { - assert(VT.isFloatingPoint()); +bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const { + assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() && + "invalid fpext types"); return true; } @@ -13369,7 +13719,7 @@ void PPCTargetLowering::insertCopiesSplitCSR( // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. - assert(Entry->getParent()->getFunction()->hasFnAttribute( + assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); @@ -13467,3 +13817,38 @@ SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { return SDValue(); } + +bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { + // Only duplicate to increase tail-calls for the 64bit SysV ABIs. + if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) + return false; + + // If not a tail call then no need to proceed. 
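+  // (Illustrative IR for the calls this hook screens; names are hypothetical:
+  //   %r = tail call fastcc i64 @callee(i64 %x)
+  //   ret i64 %r
+  // Only calls already marked "tail", as above, are worth duplicating
+  // returns for.)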
+  if (!CI->isTailCall())
+    return false;
+
+  // If tail calls are disabled for the caller then we are done.
+  const Function *Caller = CI->getParent()->getParent();
+  auto Attr = Caller->getFnAttribute("disable-tail-calls");
+  if (Attr.getValueAsString() == "true")
+    return false;
+
+  // If sibling calls have been disabled and tail-calls aren't guaranteed,
+  // there is no reason to duplicate.
+  auto &TM = getTargetMachine();
+  if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
+    return false;
+
+  // Can't tail call a function called indirectly, or if it has variadic args.
+  const Function *Callee = CI->getCalledFunction();
+  if (!Callee || Callee->isVarArg())
+    return false;
+
+  // Make sure the callee and caller calling conventions are eligible for TCO.
+  if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
+                                           CI->getCallingConv()))
+    return false;
+
+  // If the function is local then we have a good chance at tail-calling it.
+  return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 49d7d8220af16..b119e5b4a5649 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
@@ -30,13 +31,20 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Type.h"
-#include "llvm/Target/TargetLowering.h"
 #include <utility>
 
 namespace llvm {
 
   namespace PPCISD {
 
+    // When adding a NEW PPCISD node please add it to the correct position in
+    // the enum. The order of elements in this enum matters!
+    // Values that are added after this entry:
+    //   STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE
+    // are considered memory opcodes and are treated differently than entries
+    // that come before it. For example, ADD or MUL should be placed before
+    // the ISD::FIRST_TARGET_MEMORY_OPCODE while a LOAD or STORE should come
+    // after it.
     enum NodeType : unsigned {
       // Start the numbering where the builtin ops and target ops leave off.
       FIRST_NUMBER = ISD::BUILTIN_OP_END,
@@ -86,15 +94,15 @@ namespace llvm {
       ///
       XXSPLT,
 
-      /// XXINSERT - The PPC VSX insert instruction
+      /// VECINSERT - The PPC vector insert instruction
       ///
-      XXINSERT,
+      VECINSERT,
 
       /// XXREVERSE - The PPC VSX reverse instruction
       ///
       XXREVERSE,
 
-      /// VECSHL - The PPC VSX shift left instruction
+      /// VECSHL - The PPC vector shift left instruction
       ///
       VECSHL,
@@ -254,7 +262,7 @@ namespace llvm {
      /// local dynamic TLS on PPC32.
      PPC32_PICGOT,
 
-     /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
+     /// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec
      /// TLS model, produces an ADDIS8 instruction that adds the GOT
      /// base to sym\@got\@tprel\@ha.
      ADDIS_GOT_TPREL_HA,
@@ -273,18 +281,18 @@ namespace llvm {
      /// TLS sequence.
      ADD_TLS,
 
-     /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS
+     /// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS
      /// model, produces an ADDIS8 instruction that adds the GOT base
      /// register to sym\@got\@tlsgd\@ha.
ADDIS_TLSGD_HA, - /// %X3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS + /// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS /// model, produces an ADDI8 instruction that adds G8RReg to /// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by /// ADDIS_TLSGD_L_ADDR until after register assignment. ADDI_TLSGD_L, - /// %X3 = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS + /// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS /// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by /// ADDIS_TLSGD_L_ADDR until after register assignment. GET_TLS_ADDR, @@ -294,18 +302,18 @@ namespace llvm { /// register assignment. ADDI_TLSGD_L_ADDR, - /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS + /// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS /// model, produces an ADDIS8 instruction that adds the GOT base /// register to sym\@got\@tlsld\@ha. ADDIS_TLSLD_HA, - /// %X3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS + /// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS /// model, produces an ADDI8 instruction that adds G8RReg to /// sym\@got\@tlsld\@l and stores the result in X3. Hidden by /// ADDIS_TLSLD_L_ADDR until after register assignment. ADDI_TLSLD_L, - /// %X3 = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS + /// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS /// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by /// ADDIS_TLSLD_L_ADDR until after register assignment. GET_TLSLD_ADDR, @@ -315,7 +323,7 @@ namespace llvm { /// following register assignment. ADDI_TLSLD_L_ADDR, - /// G8RC = ADDIS_DTPREL_HA %X3, Symbol - For the local-dynamic TLS + /// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS /// model, produces an ADDIS8 instruction that adds X3 to /// sym\@dtprel\@ha. ADDIS_DTPREL_HA, @@ -578,8 +586,8 @@ namespace llvm { bool supportSplitCSR(MachineFunction *MF) const override { return - MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && - MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction().hasFnAttribute(Attribute::NoUnwind); } void initializeSplitCSR(MachineBasicBlock *Entry) const override; @@ -727,7 +735,8 @@ namespace llvm { /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, - Type *Ty, unsigned AS) const override; + Type *Ty, unsigned AS, + Instruction *I = nullptr) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can @@ -749,14 +758,14 @@ namespace llvm { bool isZExtFree(SDValue Val, EVT VT2) const override; - bool isFPExtFree(EVT VT) const override; + bool isFPExtFree(EVT DestVT, EVT SrcVT) const override; /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. 
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; - bool convertSelectOfConstantsToMath() const override { + bool convertSelectOfConstantsToMath(EVT VT) const override { return true; } @@ -764,6 +773,7 @@ namespace llvm { bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, unsigned Intrinsic) const override; /// getOptimalMemOpType - Returns the target specific optimal type for load @@ -898,7 +908,7 @@ namespace llvm { IsEligibleForTailCallOptimization_64SVR4( SDValue Callee, CallingConv::ID CalleeCC, - ImmutableCallSite *CS, + ImmutableCallSite CS, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &Ins, @@ -944,6 +954,7 @@ namespace llvm { SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; @@ -964,7 +975,7 @@ namespace llvm { SDValue &Callee, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const; + ImmutableCallSite CS) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -1015,7 +1026,7 @@ namespace llvm { const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const; + ImmutableCallSite CS) const; SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, @@ -1024,7 +1035,7 @@ namespace llvm { const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const; + ImmutableCallSite CS) const; SDValue LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, @@ -1033,7 +1044,7 @@ namespace llvm { const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const; + ImmutableCallSite CS) const; SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; @@ -1063,7 +1074,23 @@ namespace llvm { SDValue combineElementTruncationToVectorTruncation(SDNode *N, DAGCombinerInfo &DCI) const; - }; + + /// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be + /// handled by the VINSERTH instruction introduced in ISA 3.0. This is + /// essentially any shuffle of v8i16 vectors that just inserts one element + /// from one vector into the other. + SDValue lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const; + + /// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be + /// handled by the VINSERTB instruction introduced in ISA 3.0. This is + /// essentially v16i8 vector version of VINSERTH. + SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const; + + // Return whether the call instruction can potentially be optimized to a + // tail call. This will cause the optimizers to attempt to move, or + // duplicate return instructions to help enable tail call optimizations. 
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override; + }; // end class PPCTargetLowering namespace PPC { diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index e2af5e5295445..fdd28c2ff03f2 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -194,6 +194,11 @@ def : Pat<(PPCcall_nop (i64 texternalsym:$dst)), (BL8_NOP texternalsym:$dst)>; // Atomic operations +// FIXME: some of these might be used with constant operands. This will result +// in constant materialization instructions that may be redundant. We currently +// clean this up in PPCMIPeephole with calls to +// PPCInstrInfo::convertToImmediateForm() but we should probably not emit them +// in the first place. let usesCustomInserter = 1 in { let Defs = [CR0] in { def ATOMIC_LOAD_ADD_I64 : Pseudo< @@ -642,8 +647,13 @@ def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS), defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), "sradi", "$rA, $rS, $SH", IIC_IntRotateDI, [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64; + +defm EXTSWSLI : XSForm_1r<31, 445, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), + "extswsli", "$rA, $rS, $SH", IIC_IntRotateDI, + []>, isPPC64; + // For fast-isel: -let isCodeGenOnly = 1 in +let isCodeGenOnly = 1, Defs = [CARRY] in def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH), "sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64; @@ -673,6 +683,9 @@ def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS), "popcntw $rA, $rS", IIC_IntGeneral, [(set i32:$rA, (ctpop i32:$rS))]>; +def POPCNTB : XForm_11<31, 122, (outs gprc:$rA), (ins gprc:$rS), + "popcntb $rA, $rS", IIC_IntGeneral, []>; + defm DIVD : XOForm_1rcr<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), "divd", "$rT, $rA, $rB", IIC_IntDivD, [(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64; @@ -685,6 +698,18 @@ def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), isPPC64, Requires<[HasExtDiv]>; let Predicates = [IsISA3_0] in { +def MADDHD : VAForm_1a<48, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), + "maddhd $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; +def MADDHDU : VAForm_1a<49, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), + "maddhdu $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; +def MADDLD : VAForm_1a<51, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), + "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; +def SETB : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA), + "setb $RT, $BFA", IIC_IntGeneral>, isPPC64; +def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins i32imm:$L), + "darn $RT, $L", IIC_LdStLD>, isPPC64; +def ADDPCIS : DXForm<19, 2, (outs g8rc:$RT), (ins i32imm:$D), + "addpcis $RT, $D", IIC_BrB, []>, isPPC64; def MODSD : XForm_8<31, 777, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), "modsd $rT, $rA, $rB", IIC_IntDivW, [(set i64:$rT, (srem i64:$rA, i64:$rB))]>; diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 5465b5f2d66cd..e751c149b0b32 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -477,10 +477,10 @@ def VPERM : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm, def VSEL : VA1a_Int_Ty<42, "vsel", int_ppc_altivec_vsel, v4i32>; // Shuffles. 
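// (Example of the vsldoi semantics assumed by the pattern below: with
//  $SH = 4, "vsldoi vD, vA, vB, 4" selects bytes 4..19 of the 32-byte
//  concatenation vA||vB, i.e. the pair shifted left by 4 bytes.)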
-def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH), +def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u4imm:$SH), "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP, - [(set v16i8:$vD, - (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>; + [(set v16i8:$vD, + (PPCvecshl v16i8:$vA, v16i8:$vB, imm32SExt16:$SH))]>; // VX-Form instructions. AltiVec arithmetic ops. let isCommutable = 1 in { @@ -908,6 +908,9 @@ def:Pat<(vpkuwum_unary_shuffle v16i8:$vA, undef), (VPKUWUM $vA, $vA)>; def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef), (VPKUHUM $vA, $vA)>; +def:Pat<(vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB), + (VSLDOI v16i8:$vA, v16i8:$vB, (VSLDOI_get_imm $SH))>; + // Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands. // These fragments are matched for little-endian, where the inputs must @@ -1309,8 +1312,18 @@ def VEXTUWLX : VX1_RT5_RA5_VB5<1677, "vextuwlx", []>; def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>; // Vector Insert Element Instructions -def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>; -def VINSERTH : VX1_VT5_UIM5_VB5<845, "vinserth", []>; +def VINSERTB : VXForm_1<781, (outs vrrc:$vD), + (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB), + "vinsertb $vD, $vB, $UIM", IIC_VecGeneral, + [(set v16i8:$vD, (PPCvecinsert v16i8:$vDi, v16i8:$vB, + imm32SExt16:$UIM))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; +def VINSERTH : VXForm_1<845, (outs vrrc:$vD), + (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB), + "vinserth $vD, $vB, $UIM", IIC_VecGeneral, + [(set v8i16:$vD, (PPCvecinsert v8i16:$vDi, v8i16:$vB, + imm32SExt16:$UIM))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>; def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>; @@ -1488,4 +1501,19 @@ def VABSDUH : VXForm_1<1091, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), def VABSDUW : VXForm_1<1155, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vabsduw $vD, $vA, $vB", IIC_VecGeneral, [(set v4i32:$vD, (int_ppc_altivec_vabsduw v4i32:$vA, v4i32:$vB))]>; + +def : Pat<(v16i8:$vD (abs v16i8:$vA)), + (v16i8 (VABSDUB $vA, (V_SET0B)))>; +def : Pat<(v8i16:$vD (abs v8i16:$vA)), + (v8i16 (VABSDUH $vA, (V_SET0H)))>; +def : Pat<(v4i32:$vD (abs v4i32:$vA)), + (v4i32 (VABSDUW $vA, (V_SET0)))>; + +def : Pat<(v16i8:$vD (abs (sub v16i8:$vA, v16i8:$vB))), + (v16i8 (VABSDUB $vA, $vB))>; +def : Pat<(v8i16:$vD (abs (sub v8i16:$vA, v8i16:$vB))), + (v8i16 (VABSDUH $vA, $vB))>; +def : Pat<(v4i32:$vD (abs (sub v4i32:$vA, v4i32:$vB))), + (v4i32 (VABSDUW $vA, $vB))>; + } // end HasP9Altivec diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index ef7d2012a2332..f2845415ecb5a 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -386,6 +386,22 @@ class DSForm_1 opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, let Inst{30-31} = xo; } +// ISA V3.0B 1.6.6 DX-Form +class DXForm opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I { + bits<5> RT; + bits<16> D; + + let Pattern = pattern; + + let Inst{6-10} = RT; + let Inst{11-15} = D{5-1}; // d1 + let Inst{16-25} = D{15-6}; // d0 + let Inst{26-30} = xo; + let Inst{31} = D{0}; // d2 +} + // DQ-Form: [PO T RA DQ TX XO] or [PO S RA DQ SX XO] class DQ_RD6_RS5_DQ12 opcode, bits<3> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> @@ -725,6 +741,96 @@ class XForm_43 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = RC; } +class XForm_44 opcode, bits<10> xo, dag OOL, dag 
IOL, string asmstr, + InstrItinClass itin> + : I { + bits<5> RT; + bits<3> BFA; + + let Inst{6-10} = RT; + let Inst{11-13} = BFA; + let Inst{14-15} = 0; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_45 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I { + bits<5> RT; + bits<2> L; + + let Inst{6-10} = RT; + let Inst{11-13} = 0; + let Inst{14-15} = L; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class X_FRT5_XO2_XO3_XO10 opcode, bits<2> xo1, bits<3> xo2, bits<10> xo, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, + list pattern> + : XForm_base_r3xo { + let Pattern = pattern; + + let Inst{6-10} = RST; + let Inst{11-12} = xo1; + let Inst{13-15} = xo2; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class X_FRT5_XO2_XO3_FRB5_XO10 opcode, bits<2> xo1, bits<3> xo2, + bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : XForm_base_r3xo { + let Pattern = pattern; + bits<5> FRB; + + let Inst{6-10} = RST; + let Inst{11-12} = xo1; + let Inst{13-15} = xo2; + let Inst{16-20} = FRB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class X_FRT5_XO2_XO3_DRM3_XO10 opcode, bits<2> xo1, bits<3> xo2, + bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : XForm_base_r3xo { + let Pattern = pattern; + bits<3> DRM; + + let Inst{6-10} = RST; + let Inst{11-12} = xo1; + let Inst{13-15} = xo2; + let Inst{16-17} = 0; + let Inst{18-20} = DRM; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class X_FRT5_XO2_XO3_RM2_X10 opcode, bits<2> xo1, bits<3> xo2, + bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : XForm_base_r3xo { + let Pattern = pattern; + bits<2> RM; + + let Inst{6-10} = RST; + let Inst{11-12} = xo1; + let Inst{13-15} = xo2; + let Inst{16-18} = 0; + let Inst{19-20} = RM; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + + class XForm_0 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : XForm_base_r3xo { @@ -1995,4 +2101,5 @@ class Pseudo pattern> let PPC64 = 0; let Pattern = pattern; let Inst{31-0} = 0; + let hasNoSchedulingInfo = 1; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index e74ba38c351f0..ffb5cc8757f25 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -20,7 +20,7 @@ #include "PPCTargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -46,6 +46,16 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "PPCGenInstrInfo.inc" +STATISTIC(NumStoreSPILLVSRRCAsVec, + "Number of spillvsrrc spilled to stack as vec"); +STATISTIC(NumStoreSPILLVSRRCAsGpr, + "Number of spillvsrrc spilled to stack as gpr"); +STATISTIC(NumGPRtoVSRSpill, "Number of gpr spills to spillvsrrc"); +STATISTIC(CmpIselsConverted, + "Number of ISELs that depend on comparison of constants converted"); +STATISTIC(MissedConvertibleImmediateInstrs, + "Number of compare-immediate instructions fed by constants"); + static cl:: opt DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden, cl::desc("Disable analysis for CTR loops")); @@ -254,6 +264,7 @@ bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, switch 
(MI.getOpcode()) { default: return false; case PPC::EXTSW: + case PPC::EXTSW_32: case PPC::EXTSW_32_64: SrcReg = MI.getOperand(1).getReg(); DstReg = MI.getOperand(0).getReg(); @@ -275,11 +286,12 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, case PPC::RESTORE_CRBIT: case PPC::LVX: case PPC::LXVD2X: - case PPC::LXVX: + case PPC::LXV: case PPC::QVLFDX: case PPC::QVLFSXs: case PPC::QVLFDXb: case PPC::RESTORE_VRSAVE: + case PPC::SPILLTOVSR_LD: // Check for the operands added by addFrameReference (the immediate is the // offset which defaults to 0). if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() && @@ -328,11 +340,12 @@ unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI, case PPC::SPILL_CRBIT: case PPC::STVX: case PPC::STXVD2X: - case PPC::STXVX: + case PPC::STXV: case PPC::QVSTFDX: case PPC::QVSTFSXs: case PPC::QVSTFDXb: case PPC::SPILL_VRSAVE: + case PPC::SPILLTOVSR_ST: // Check for the operands added by addFrameReference (the immediate is the // offset which defaults to 0). if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() && @@ -486,6 +499,20 @@ bool PPCInstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (!isUnpredicatedTerminator(*I)) return false; + if (AllowModify) { + // If the BB ends with an unconditional branch to the fallthrough BB, + // we eliminate the branch instruction. + if (I->getOpcode() == PPC::B && + MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { + I->eraseFromParent(); + + // We update iterator after deleting the last branch. + I = MBB.getLastNonDebugInstr(); + if (I == MBB.end() || !isUnpredicatedTerminator(*I)) + return false; + } + } + // Get the last instruction in the block. MachineInstr &LastInst = *I; @@ -917,7 +944,18 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(SrcReg); getKillRegState(KillSrc); return; - } + } else if (PPC::G8RCRegClass.contains(SrcReg) && + PPC::VSFRCRegClass.contains(DestReg)) { + BuildMI(MBB, I, DL, get(PPC::MTVSRD), DestReg).addReg(SrcReg); + NumGPRtoVSRSpill++; + getKillRegState(KillSrc); + return; + } else if (PPC::VSFRCRegClass.contains(SrcReg) && + PPC::G8RCRegClass.contains(DestReg)) { + BuildMI(MBB, I, DL, get(PPC::MFVSRD), DestReg).addReg(SrcReg); + getKillRegState(KillSrc); + return; + } unsigned Opc; if (PPC::GPRCRegClass.contains(DestReg, SrcReg)) @@ -1015,7 +1053,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, FrameIdx)); NonRI = true; } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) { - unsigned Op = Subtarget.hasP9Vector() ? PPC::STXVX : PPC::STXVD2X; + unsigned Op = Subtarget.hasP9Vector() ? PPC::STXV : PPC::STXVD2X; NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op)) .addReg(SrcReg, getKillRegState(isKill)), @@ -1061,6 +1099,11 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, getKillRegState(isKill)), FrameIdx)); NonRI = true; + } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILLTOVSR_ST)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); } else { llvm_unreachable("Unknown regclass!"); } @@ -1148,7 +1191,7 @@ bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, FrameIdx)); NonRI = true; } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) { - unsigned Op = Subtarget.hasP9Vector() ? PPC::LXVX : PPC::LXVD2X; + unsigned Op = Subtarget.hasP9Vector() ? 
PPC::LXV : PPC::LXVD2X; NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op), DestReg), FrameIdx)); NonRI = true; @@ -1182,6 +1225,9 @@ bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDXb), DestReg), FrameIdx)); NonRI = true; + } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILLTOVSR_LD), + DestReg), FrameIdx)); } else { llvm_unreachable("Unknown regclass!"); } @@ -1592,37 +1638,20 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // Get the unique definition of SrcReg. MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); if (!MI) return false; - int MIOpC = MI->getOpcode(); bool equalityOnly = false; bool noSub = false; if (isPPC64) { if (is32BitSignedCompare) { // We can perform this optimization only if MI is sign-extending. - if (MIOpC == PPC::SRAW || MIOpC == PPC::SRAWo || - MIOpC == PPC::SRAWI || MIOpC == PPC::SRAWIo || - MIOpC == PPC::EXTSB || MIOpC == PPC::EXTSBo || - MIOpC == PPC::EXTSH || MIOpC == PPC::EXTSHo || - MIOpC == PPC::EXTSW || MIOpC == PPC::EXTSWo) { + if (isSignExtended(*MI)) noSub = true; - } else + else return false; } else if (is32BitUnsignedCompare) { - // 32-bit rotate and mask instructions are zero extending only if MB <= ME - bool isZeroExtendingRotate = - (MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINMo || - MIOpC == PPC::RLWNM || MIOpC == PPC::RLWNMo) - && MI->getOperand(3).getImm() <= MI->getOperand(4).getImm(); - // We can perform this optimization, equality only, if MI is // zero-extending. - // FIXME: Other possible target instructions include ANDISo and - // RLWINM aliases, such as ROTRWI, EXTLWI, SLWI and SRWI. - if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo || - MIOpC == PPC::SLW || MIOpC == PPC::SLWo || - MIOpC == PPC::SRW || MIOpC == PPC::SRWo || - MIOpC == PPC::ANDIo || - isZeroExtendingRotate) { + if (isZeroExtended(*MI)) { noSub = true; equalityOnly = true; } else @@ -1640,8 +1669,10 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, I != IE; ++I) { MachineInstr *UseMI = &*I; if (UseMI->getOpcode() == PPC::BCC) { - unsigned Pred = UseMI->getOperand(0).getImm(); - if (Pred != PPC::PRED_EQ && Pred != PPC::PRED_NE) + PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm(); + unsigned PredCond = PPC::getPredicateCondition(Pred); + // We ignore hint bits when checking for non-equality comparisons. + if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE) return false; } else if (UseMI->getOpcode() == PPC::ISEL || UseMI->getOpcode() == PPC::ISEL8) { @@ -1688,34 +1719,47 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, else if (MI->getParent() != CmpInstr.getParent()) return false; else if (Value != 0) { - // The record-form instructions set CR bit based on signed comparison against 0. - // We try to convert a compare against 1 or -1 into a compare against 0. 
-    bool Success = false;
-    if (!equalityOnly && MRI->hasOneUse(CRReg)) {
-      MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg);
-      if (UseMI->getOpcode() == PPC::BCC) {
-        PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
-        int16_t Immed = (int16_t)Value;
-
-        if (Immed == -1 && Pred == PPC::PRED_GT) {
-          // We convert "greater than -1" into "greater than or equal to 0",
-          // since we are assuming signed comparison by !equalityOnly
-          PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
-                                                 PPC::PRED_GE));
-          Success = true;
-        }
-        else if (Immed == 1 && Pred == PPC::PRED_LT) {
-          // We convert "less than 1" into "less than or equal to 0".
-          PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
-                                                 PPC::PRED_LE));
-          Success = true;
-        }
-      }
-    }
+    // The record-form instructions set CR bit based on signed comparison
+    // against 0. We try to convert a compare against 1 or -1 into a compare
+    // against 0 to exploit record-form instructions. For example, we change
+    // the condition "greater than -1" into "greater than or equal to 0"
+    // and "less than 1" into "less than or equal to 0".
+
+    // Since we optimize comparison based on a specific branch condition,
+    // we don't optimize if the condition code is used more than once.
+    if (equalityOnly || !MRI->hasOneUse(CRReg))
+      return false;
+
+    MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg);
+    if (UseMI->getOpcode() != PPC::BCC)
+      return false;
 
-    // PPC does not have a record-form SUBri.
-    if (!Success)
+    PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
+    PPC::Predicate NewPred = Pred;
+    unsigned PredCond = PPC::getPredicateCondition(Pred);
+    unsigned PredHint = PPC::getPredicateHint(Pred);
+    int16_t Immed = (int16_t)Value;
+
+    // When modifying the condition in the predicate, we propagate hint bits
+    // from the original predicate to the new one.
+    if (Immed == -1 && PredCond == PPC::PRED_GT)
+      // We convert "greater than -1" into "greater than or equal to 0",
+      // since we are assuming signed comparison by !equalityOnly
+      NewPred = PPC::getPredicate(PPC::PRED_GE, PredHint);
+    else if (Immed == -1 && PredCond == PPC::PRED_LE)
+      // We convert "less than or equal to -1" into "less than 0".
+      NewPred = PPC::getPredicate(PPC::PRED_LT, PredHint);
+    else if (Immed == 1 && PredCond == PPC::PRED_LT)
+      // We convert "less than 1" into "less than or equal to 0".
+      NewPred = PPC::getPredicate(PPC::PRED_LE, PredHint);
+    else if (Immed == 1 && PredCond == PPC::PRED_GE)
+      // We convert "greater than or equal to 1" into "greater than 0".
+      NewPred = PPC::getPredicate(PPC::PRED_GT, PredHint);
+    else
       return false;
+
+    PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
+                                           NewPred));
   }
 
   // Search for Sub.
@@ -1763,7 +1807,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
   if (!MI) MI = Sub;
 
   int NewOpC = -1;
-  MIOpC = MI->getOpcode();
+  int MIOpC = MI->getOpcode();
   if (MIOpC == PPC::ANDIo || MIOpC == PPC::ANDIo8)
     NewOpC = MIOpC;
   else {
@@ -1804,9 +1848,11 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
     MachineInstr *UseMI = &*I;
     if (UseMI->getOpcode() == PPC::BCC) {
       PPC::Predicate Pred = (PPC::Predicate) UseMI->getOperand(0).getImm();
+      unsigned PredCond = PPC::getPredicateCondition(Pred);
       assert((!equalityOnly ||
-              Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE) &&
+              PredCond == PPC::PRED_EQ || PredCond == PPC::PRED_NE) &&
              "Invalid predicate for equality-only optimization");
+      (void)PredCond; // To suppress warning in release build.
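   // (Example of the swap performed below: if the compare tested "a < b" but
   // Sub computes b - a, each user's condition must flip, e.g. PRED_LT
   // becomes PRED_GT, while PRED_EQ/PRED_NE are unaffected.)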
PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), PPC::getSwappedPredicate(Pred))); } else if (UseMI->getOpcode() == PPC::ISEL || @@ -1935,29 +1981,13 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { return makeArrayRef(TargetFlags); } -bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - auto &MBB = *MI.getParent(); - auto DL = MI.getDebugLoc(); - switch (MI.getOpcode()) { - case TargetOpcode::LOAD_STACK_GUARD: { - assert(Subtarget.isTargetLinux() && - "Only Linux target is expected to contain LOAD_STACK_GUARD"); - const int64_t Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008; - const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2; - MI.setDesc(get(Subtarget.isPPC64() ? PPC::LD : PPC::LWZ)); - MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addImm(Offset) - .addReg(Reg); - return true; - } - case PPC::DFLOADf32: - case PPC::DFLOADf64: - case PPC::DFSTOREf32: - case PPC::DFSTOREf64: { - assert(Subtarget.hasP9Vector() && - "Invalid D-Form Pseudo-ops on non-P9 target."); - assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() && - "D-form op must have register and immediate operands"); +// Expand VSX Memory Pseudo instruction to either a VSX or a FP instruction. +// The VSX versions have the advantage of a full 64-register target whereas +// the FP ones have the advantage of lower latency and higher throughput. So +// what we are after is using the faster instructions in low register pressure +// situations and using the larger register file in high register pressure +// situations. +bool PPCInstrInfo::expandVSXMemPseudo(MachineInstr &MI) const { unsigned UpperOpcode, LowerOpcode; switch (MI.getOpcode()) { case PPC::DFLOADf32: @@ -1976,7 +2006,38 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { UpperOpcode = PPC::STXSD; LowerOpcode = PPC::STFD; break; + case PPC::XFLOADf32: + UpperOpcode = PPC::LXSSPX; + LowerOpcode = PPC::LFSX; + break; + case PPC::XFLOADf64: + UpperOpcode = PPC::LXSDX; + LowerOpcode = PPC::LFDX; + break; + case PPC::XFSTOREf32: + UpperOpcode = PPC::STXSSPX; + LowerOpcode = PPC::STFSX; + break; + case PPC::XFSTOREf64: + UpperOpcode = PPC::STXSDX; + LowerOpcode = PPC::STFDX; + break; + case PPC::LIWAX: + UpperOpcode = PPC::LXSIWAX; + LowerOpcode = PPC::LFIWAX; + break; + case PPC::LIWZX: + UpperOpcode = PPC::LXSIWZX; + LowerOpcode = PPC::LFIWZX; + break; + case PPC::STIWX: + UpperOpcode = PPC::STXSIWX; + LowerOpcode = PPC::STFIWX; + break; + default: + llvm_unreachable("Unknown Operation!"); } + unsigned TargetReg = MI.getOperand(0).getReg(); unsigned Opcode; if ((TargetReg >= PPC::F0 && TargetReg <= PPC::F31) || @@ -1986,7 +2047,95 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { Opcode = UpperOpcode; MI.setDesc(get(Opcode)); return true; +} + +bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + auto &MBB = *MI.getParent(); + auto DL = MI.getDebugLoc(); + + switch (MI.getOpcode()) { + case TargetOpcode::LOAD_STACK_GUARD: { + assert(Subtarget.isTargetLinux() && + "Only Linux target is expected to contain LOAD_STACK_GUARD"); + const int64_t Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008; + const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2; + MI.setDesc(get(Subtarget.isPPC64() ? 
PPC::LD : PPC::LWZ)); + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addImm(Offset) + .addReg(Reg); + return true; } + case PPC::DFLOADf32: + case PPC::DFLOADf64: + case PPC::DFSTOREf32: + case PPC::DFSTOREf64: { + assert(Subtarget.hasP9Vector() && + "Invalid D-Form Pseudo-ops on Pre-P9 target."); + assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() && + "D-form op must have register and immediate operands"); + return expandVSXMemPseudo(MI); + } + case PPC::XFLOADf32: + case PPC::XFSTOREf32: + case PPC::LIWAX: + case PPC::LIWZX: + case PPC::STIWX: { + assert(Subtarget.hasP8Vector() && + "Invalid X-Form Pseudo-ops on Pre-P8 target."); + assert(MI.getOperand(2).isReg() && MI.getOperand(1).isReg() && + "X-form op must have register and register operands"); + return expandVSXMemPseudo(MI); + } + case PPC::XFLOADf64: + case PPC::XFSTOREf64: { + assert(Subtarget.hasVSX() && + "Invalid X-Form Pseudo-ops on target that has no VSX."); + assert(MI.getOperand(2).isReg() && MI.getOperand(1).isReg() && + "X-form op must have register and register operands"); + return expandVSXMemPseudo(MI); + } + case PPC::SPILLTOVSR_LD: { + unsigned TargetReg = MI.getOperand(0).getReg(); + if (PPC::VSFRCRegClass.contains(TargetReg)) { + MI.setDesc(get(PPC::DFLOADf64)); + return expandPostRAPseudo(MI); + } + else + MI.setDesc(get(PPC::LD)); + return true; + } + case PPC::SPILLTOVSR_ST: { + unsigned SrcReg = MI.getOperand(0).getReg(); + if (PPC::VSFRCRegClass.contains(SrcReg)) { + NumStoreSPILLVSRRCAsVec++; + MI.setDesc(get(PPC::DFSTOREf64)); + return expandPostRAPseudo(MI); + } else { + NumStoreSPILLVSRRCAsGpr++; + MI.setDesc(get(PPC::STD)); + } + return true; + } + case PPC::SPILLTOVSR_LDX: { + unsigned TargetReg = MI.getOperand(0).getReg(); + if (PPC::VSFRCRegClass.contains(TargetReg)) + MI.setDesc(get(PPC::LXSDX)); + else + MI.setDesc(get(PPC::LDX)); + return true; + } + case PPC::SPILLTOVSR_STX: { + unsigned SrcReg = MI.getOperand(0).getReg(); + if (PPC::VSFRCRegClass.contains(SrcReg)) { + NumStoreSPILLVSRRCAsVec++; + MI.setDesc(get(PPC::STXSDX)); + } else { + NumStoreSPILLVSRRCAsGpr++; + MI.setDesc(get(PPC::STDX)); + } + return true; + } + case PPC::CFENCE8: { auto Val = MI.getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(PPC::CMPD), PPC::CR7).addReg(Val).addReg(Val); @@ -2002,6 +2151,829 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return false; } +unsigned PPCInstrInfo::lookThruCopyLike(unsigned SrcReg, + const MachineRegisterInfo *MRI) { + while (true) { + MachineInstr *MI = MRI->getVRegDef(SrcReg); + if (!MI->isCopyLike()) + return SrcReg; + + unsigned CopySrcReg; + if (MI->isCopy()) + CopySrcReg = MI->getOperand(1).getReg(); + else { + assert(MI->isSubregToReg() && "Bad opcode for lookThruCopyLike"); + CopySrcReg = MI->getOperand(2).getReg(); + } + + if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) + return CopySrcReg; + + SrcReg = CopySrcReg; + } +} + +// Essentially a compile-time implementation of a compare->isel sequence. +// It takes two constants to compare, along with the true/false registers +// and the comparison type (as a subreg to a CR field) and returns one +// of the true/false registers, depending on the comparison results. +static unsigned selectReg(int64_t Imm1, int64_t Imm2, unsigned CompareOpc, + unsigned TrueReg, unsigned FalseReg, + unsigned CRSubReg) { + // Signed comparisons. The immediates are assumed to be sign-extended. 
+ if (CompareOpc == PPC::CMPWI || CompareOpc == PPC::CMPDI) {
+ switch (CRSubReg) {
+ default: llvm_unreachable("Unknown integer comparison type.");
+ case PPC::sub_lt:
+ return Imm1 < Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_gt:
+ return Imm1 > Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_eq:
+ return Imm1 == Imm2 ? TrueReg : FalseReg;
+ }
+ }
+ // Unsigned comparisons.
+ else if (CompareOpc == PPC::CMPLWI || CompareOpc == PPC::CMPLDI) {
+ switch (CRSubReg) {
+ default: llvm_unreachable("Unknown integer comparison type.");
+ case PPC::sub_lt:
+ return (uint64_t)Imm1 < (uint64_t)Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_gt:
+ return (uint64_t)Imm1 > (uint64_t)Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_eq:
+ return Imm1 == Imm2 ? TrueReg : FalseReg;
+ }
+ }
+ return PPC::NoRegister;
+}
+
+// Replace an instruction with one that materializes a constant (and sets
+// CR0 if the original instruction was a record-form instruction).
+void PPCInstrInfo::replaceInstrWithLI(MachineInstr &MI,
+ const LoadImmediateInfo &LII) const {
+ // Remove existing operands.
+ int OperandToKeep = LII.SetCR ? 1 : 0;
+ for (int i = MI.getNumOperands() - 1; i > OperandToKeep; i--)
+ MI.RemoveOperand(i);
+
+ // Replace the instruction.
+ if (LII.SetCR) {
+ MI.setDesc(get(LII.Is64Bit ? PPC::ANDIo8 : PPC::ANDIo));
+ // Set the immediate.
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(LII.Imm).addReg(PPC::CR0, RegState::ImplicitDefine);
+ return;
+ }
+ else
+ MI.setDesc(get(LII.Is64Bit ? PPC::LI8 : PPC::LI));
+
+ // Set the immediate.
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(LII.Imm);
+}
+
+MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
+ unsigned &ConstOp,
+ bool &SeenIntermediateUse) const {
+ ConstOp = ~0U;
+ MachineInstr *DefMI = nullptr;
+ MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
+ // If we're in SSA, get the defs through the MRI. Otherwise, only look
+ // within the basic block to see if the register is defined using an LI/LI8.
+ if (MRI->isSSA()) {
+ for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
+ if (!MI.getOperand(i).isReg())
+ continue;
+ unsigned Reg = MI.getOperand(i).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ unsigned TrueReg = lookThruCopyLike(Reg, MRI);
+ if (TargetRegisterInfo::isVirtualRegister(TrueReg)) {
+ DefMI = MRI->getVRegDef(TrueReg);
+ if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) {
+ ConstOp = i;
+ break;
+ }
+ }
+ }
+ } else {
+ // Looking back through the definition for each operand could be expensive,
+ // so exit early if this isn't an instruction that either has an immediate
+ // form or is already an immediate form that we can handle.
+ ImmInstrInfo III;
+ unsigned Opc = MI.getOpcode();
+ bool ConvertibleImmForm =
+ Opc == PPC::CMPWI || Opc == PPC::CMPLWI ||
+ Opc == PPC::CMPDI || Opc == PPC::CMPLDI ||
+ Opc == PPC::ADDI || Opc == PPC::ADDI8 ||
+ Opc == PPC::ORI || Opc == PPC::ORI8 ||
+ Opc == PPC::XORI || Opc == PPC::XORI8 ||
+ Opc == PPC::RLDICL || Opc == PPC::RLDICLo ||
+ Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 ||
+ Opc == PPC::RLWINM || Opc == PPC::RLWINMo ||
+ Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o;
+ if (!instrHasImmForm(MI, III) && !ConvertibleImmForm)
+ return nullptr;
+
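// selectReg() above is effectively a compile-time compare-plus-isel. A
// self-contained model of that folding, with illustrative names (SubLT and
// friends stand in for the PPC::sub_lt subregister indices; this is a sketch,
// not the LLVM API):
#include <cassert>
#include <cstdint>

enum CmpKind { SignedCmp, UnsignedCmp };
enum CRField { SubLT, SubGT, SubEQ };

static unsigned foldIsel(int64_t Imm1, int64_t Imm2, CmpKind K, CRField Bit,
                         unsigned TrueReg, unsigned FalseReg) {
  bool Taken = false;
  switch (Bit) {
  case SubLT: Taken = K == SignedCmp ? Imm1 < Imm2
                                     : (uint64_t)Imm1 < (uint64_t)Imm2; break;
  case SubGT: Taken = K == SignedCmp ? Imm1 > Imm2
                                     : (uint64_t)Imm1 > (uint64_t)Imm2; break;
  case SubEQ: Taken = Imm1 == Imm2; break;
  }
  return Taken ? TrueReg : FalseReg;
}

int main() {
  // An isel on the lt bit fed by a signed compare of constants -1 and 1
  // always picks its true input ...
  assert(foldIsel(-1, 1, SignedCmp, SubLT, /*TrueReg=*/4, /*FalseReg=*/5) == 4);
  // ... while the unsigned compare of the same bit patterns picks the false one.
  assert(foldIsel(-1, 1, UnsignedCmp, SubLT, 4, 5) == 5);
  return 0;
}
+ // Don't convert or %X, %Y, %Y since that's just a register move.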
+ if ((Opc == PPC::OR || Opc == PPC::OR8) &&
+ MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
+ return nullptr;
+ for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
+ MachineOperand &MO = MI.getOperand(i);
+ SeenIntermediateUse = false;
+ if (MO.isReg() && MO.isUse() && !MO.isImplicit()) {
+ MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI;
+ It++;
+ unsigned Reg = MI.getOperand(i).getReg();
+ // MachineInstr::readsRegister only returns true if the machine
+ // instruction reads the exact register or its super-register. It
+ // does not consider uses of sub-registers which seems like strange
+ // behaviour. Nonetheless, if we end up with a 64-bit register here,
+ // get the corresponding 32-bit register to check.
+ if (PPC::G8RCRegClass.contains(Reg))
+ Reg = Reg - PPC::X0 + PPC::R0;
+
+ // Is this register defined by a load-immediate in this block?
+ for ( ; It != E; ++It) {
+ if (It->modifiesRegister(Reg, &getRegisterInfo())) {
+ if (It->getOpcode() == PPC::LI || It->getOpcode() == PPC::LI8) {
+ ConstOp = i;
+ return &*It;
+ } else
+ break;
+ } else if (It->readsRegister(Reg, &getRegisterInfo()))
+ // If we see another use of this reg between the def and the MI,
+ // we want to flag it so the def isn't deleted.
+ SeenIntermediateUse = true;
+ }
+ }
+ }
+ }
+ return ConstOp == ~0U ? nullptr : DefMI;
+}
+
+// If this instruction has an immediate form and one of its operands is a
+// result of a load-immediate, convert it to the immediate form if the constant
+// is in range.
+bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
+ MachineInstr **KilledDef) const {
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ bool PostRA = !MRI->isSSA();
+ bool SeenIntermediateUse = true;
+ unsigned ConstantOperand = ~0U;
+ MachineInstr *DefMI = getConstantDefMI(MI, ConstantOperand,
+ SeenIntermediateUse);
+ if (!DefMI || !DefMI->getOperand(1).isImm())
+ return false;
+ assert(ConstantOperand < MI.getNumOperands() &&
+ "The constant operand needs to be valid at this point");
+
+ int64_t Immediate = DefMI->getOperand(1).getImm();
+ // Sign-extend to 64-bits.
+ int64_t SExtImm = ((uint64_t)Immediate & ~0x7FFFuLL) != 0 ?
+ (Immediate | 0xFFFFFFFFFFFF0000) : Immediate;
+
+ if (KilledDef && MI.getOperand(ConstantOperand).isKill() &&
+ !SeenIntermediateUse)
+ *KilledDef = DefMI;
+
+ // If this is a reg+reg instruction that has a reg+imm form, convert it now.
+ ImmInstrInfo III;
+ if (instrHasImmForm(MI, III))
+ return transformToImmForm(MI, III, ConstantOperand, SExtImm);
+
+ bool ReplaceWithLI = false;
+ bool Is64BitLI = false;
+ int64_t NewImm = 0;
+ bool SetCR = false;
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default: return false;
+
+ // FIXME: Any branches conditional on such a comparison can be made
+ // unconditional. At this time, this happens too infrequently to be worth
+ // the implementation effort, but if that ever changes, we could convert
+ // such a pattern here.
+ case PPC::CMPWI:
+ case PPC::CMPLWI:
+ case PPC::CMPDI:
+ case PPC::CMPLDI: {
+ // Doing this post-RA would require dataflow analysis to reliably find uses
+ // of the CR register set by the compare.
+ if (PostRA)
+ return false;
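// The SExtImm computation above is the usual 16-bit sign-extension idiom. A
// standalone check (sketch only, not LLVM code) that it agrees with a plain
// int16_t cast over every 16-bit pattern:
#include <cassert>
#include <cstdint>

static int64_t sextImm16(int64_t Imm) {
  return ((uint64_t)Imm & ~0x7FFFuLL) != 0 ? (Imm | 0xFFFFFFFFFFFF0000)
                                           : Imm;
}

int main() {
  for (int32_t V = -32768; V <= 32767; ++V)
    assert(sextImm16((uint16_t)V) == (int64_t)(int16_t)V);
  return 0;
}
+ // If a compare-immediate is fed by an immediate and is itself an input of
+ // an ISEL (the most common case), convert the ISEL into a COPY of the
+ // correct register.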
+ bool Changed = false;
+ unsigned DefReg = MI.getOperand(0).getReg();
+ int64_t Comparand = MI.getOperand(2).getImm();
+ int64_t SExtComparand = ((uint64_t)Comparand & ~0x7FFFuLL) != 0 ?
+ (Comparand | 0xFFFFFFFFFFFF0000) : Comparand;
+
+ for (auto &CompareUseMI : MRI->use_instructions(DefReg)) {
+ unsigned UseOpc = CompareUseMI.getOpcode();
+ if (UseOpc != PPC::ISEL && UseOpc != PPC::ISEL8)
+ continue;
+ unsigned CRSubReg = CompareUseMI.getOperand(3).getSubReg();
+ unsigned TrueReg = CompareUseMI.getOperand(1).getReg();
+ unsigned FalseReg = CompareUseMI.getOperand(2).getReg();
+ unsigned RegToCopy = selectReg(SExtImm, SExtComparand, Opc, TrueReg,
+ FalseReg, CRSubReg);
+ if (RegToCopy == PPC::NoRegister)
+ continue;
+ // Can't use PPC::COPY to copy PPC::ZERO[8]. Convert it to LI[8] 0.
+ if (RegToCopy == PPC::ZERO || RegToCopy == PPC::ZERO8) {
+ CompareUseMI.setDesc(get(UseOpc == PPC::ISEL8 ? PPC::LI8 : PPC::LI));
+ CompareUseMI.getOperand(1).ChangeToImmediate(0);
+ CompareUseMI.RemoveOperand(3);
+ CompareUseMI.RemoveOperand(2);
+ continue;
+ }
+ DEBUG(dbgs() << "Found LI -> CMPI -> ISEL, replacing with a copy.\n");
+ DEBUG(DefMI->dump(); MI.dump(); CompareUseMI.dump());
+ DEBUG(dbgs() << "Is converted to:\n");
+ // Convert to copy and remove unneeded operands.
+ CompareUseMI.setDesc(get(PPC::COPY));
+ CompareUseMI.RemoveOperand(3);
+ CompareUseMI.RemoveOperand(RegToCopy == TrueReg ? 2 : 1);
+ CmpIselsConverted++;
+ Changed = true;
+ DEBUG(CompareUseMI.dump());
+ }
+ if (Changed)
+ return true;
+ // This may end up incremented multiple times since this function is called
+ // during a fixed-point transformation, but it is only meant to indicate the
+ // presence of this opportunity.
+ MissedConvertibleImmediateInstrs++;
+ return false;
+ }
+
+ // Immediate forms - may simply be convertible to an LI.
+ case PPC::ADDI:
+ case PPC::ADDI8: {
+ // Does the sum fit in a 16-bit signed field?
+ int64_t Addend = MI.getOperand(2).getImm();
+ if (isInt<16>(Addend + SExtImm)) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc == PPC::ADDI8;
+ NewImm = Addend + SExtImm;
+ break;
+ }
+ return false;
+ }
+ case PPC::RLDICL:
+ case PPC::RLDICLo:
+ case PPC::RLDICL_32:
+ case PPC::RLDICL_32_64: {
+ // Use APInt's rotate function.
+ int64_t SH = MI.getOperand(2).getImm();
+ int64_t MB = MI.getOperand(3).getImm();
+ APInt InVal(Opc == PPC::RLDICL ? 64 : 32, SExtImm, true);
+ InVal = InVal.rotl(SH);
+ uint64_t Mask = (1LU << (63 - MB + 1)) - 1;
+ InVal &= Mask;
+ // Can't replace negative values with an LI as that will sign-extend
+ // and not clear the left bits. If we're setting the CR bit, we will use
+ // ANDIo which won't sign extend, so that's safe.
+ if (isUInt<15>(InVal.getSExtValue()) ||
+ (Opc == PPC::RLDICLo && isUInt<16>(InVal.getSExtValue()))) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc != PPC::RLDICL_32;
+ NewImm = InVal.getSExtValue();
+ SetCR = Opc == PPC::RLDICLo;
+ break;
+ }
+ return false;
+ }
+ case PPC::RLWINM:
+ case PPC::RLWINM8:
+ case PPC::RLWINMo:
+ case PPC::RLWINM8o: {
+ int64_t SH = MI.getOperand(2).getImm();
+ int64_t MB = MI.getOperand(3).getImm();
+ int64_t ME = MI.getOperand(4).getImm();
+ APInt InVal(32, SExtImm, true);
+ InVal = InVal.rotl(SH);
+ // Set the bits (MB + 32) to (ME + 32).
+ uint64_t Mask = ((1 << (32 - MB)) - 1) & ~((1 << (31 - ME)) - 1);
+ InVal &= Mask;
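// The RLWINM folding above rotates the known value and applies the MB..ME
// mask, using PowerPC's big-endian bit numbering (bit 0 is the MSB). A
// standalone model with plain integers instead of APInt, assuming a
// non-wrapping mask (MB <= ME); a sketch, not the in-tree code:
#include <cassert>
#include <cstdint>

static uint32_t foldRLWINM(uint32_t Val, unsigned SH, unsigned MB,
                           unsigned ME) {
  uint32_t Rot = SH ? (Val << SH) | (Val >> (32 - SH)) : Val;
  uint64_t Mask = ((1ULL << (32 - MB)) - 1) & ~((1ULL << (31 - ME)) - 1);
  return Rot & (uint32_t)Mask;
}

int main() {
  // rlwinm r, r, 0, 28, 31 keeps just the low nibble.
  assert(foldRLWINM(0x12345678, 0, 28, 31) == 0x8);
  // slwi r, r, 4 is rlwinm r, r, 4, 0, 27: the mask clears the rotated-in bits.
  assert(foldRLWINM(0x12345678, 4, 0, 27) == 0x23456780);
  return 0;
}
+ // Can't replace negative values with an LI as that will sign-extend
+ // and not clear the left bits. If we're setting the CR bit, we will use
+ // ANDIo which won't sign extend, so that's safe.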
+ bool ValueFits = isUInt<15>(InVal.getSExtValue()); + ValueFits |= ((Opc == PPC::RLWINMo || Opc == PPC::RLWINM8o) && + isUInt<16>(InVal.getSExtValue())); + if (ValueFits) { + ReplaceWithLI = true; + Is64BitLI = Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o; + NewImm = InVal.getSExtValue(); + SetCR = Opc == PPC::RLWINMo || Opc == PPC::RLWINM8o; + break; + } + return false; + } + case PPC::ORI: + case PPC::ORI8: + case PPC::XORI: + case PPC::XORI8: { + int64_t LogicalImm = MI.getOperand(2).getImm(); + int64_t Result = 0; + if (Opc == PPC::ORI || Opc == PPC::ORI8) + Result = LogicalImm | SExtImm; + else + Result = LogicalImm ^ SExtImm; + if (isInt<16>(Result)) { + ReplaceWithLI = true; + Is64BitLI = Opc == PPC::ORI8 || Opc == PPC::XORI8; + NewImm = Result; + break; + } + return false; + } + } + + if (ReplaceWithLI) { + DEBUG(dbgs() << "Replacing instruction:\n"); + DEBUG(MI.dump()); + DEBUG(dbgs() << "Fed by:\n"); + DEBUG(DefMI->dump()); + LoadImmediateInfo LII; + LII.Imm = NewImm; + LII.Is64Bit = Is64BitLI; + LII.SetCR = SetCR; + // If we're setting the CR, the original load-immediate must be kept (as an + // operand to ANDIo/ANDI8o). + if (KilledDef && SetCR) + *KilledDef = nullptr; + replaceInstrWithLI(MI, LII); + DEBUG(dbgs() << "With:\n"); + DEBUG(MI.dump()); + return true; + } + return false; +} + +bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, + ImmInstrInfo &III) const { + unsigned Opc = MI.getOpcode(); + // The vast majority of the instructions would need their operand 2 replaced + // with an immediate when switching to the reg+imm form. A marked exception + // are the update form loads/stores for which a constant operand 2 would need + // to turn into a displacement and move operand 1 to the operand 2 position. + III.ImmOpNo = 2; + III.ConstantOpNo = 2; + III.ImmWidth = 16; + III.ImmMustBeMultipleOf = 1; + switch (Opc) { + default: return false; + case PPC::ADD4: + case PPC::ADD8: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 1; + III.IsCommutative = true; + III.ImmOpcode = Opc == PPC::ADD4 ? PPC::ADDI : PPC::ADDI8; + break; + case PPC::ADDC: + case PPC::ADDC8: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = true; + III.ImmOpcode = Opc == PPC::ADDC ? PPC::ADDIC : PPC::ADDIC8; + break; + case PPC::ADDCo: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = true; + III.ImmOpcode = PPC::ADDICo; + break; + case PPC::SUBFC: + case PPC::SUBFC8: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = false; + III.ImmOpcode = Opc == PPC::SUBFC ? PPC::SUBFIC : PPC::SUBFIC8; + break; + case PPC::CMPW: + case PPC::CMPD: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = false; + III.ImmOpcode = Opc == PPC::CMPW ? PPC::CMPWI : PPC::CMPDI; + break; + case PPC::CMPLW: + case PPC::CMPLD: + III.SignedImm = false; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = false; + III.ImmOpcode = Opc == PPC::CMPLW ? 
PPC::CMPLWI : PPC::CMPLDI; + break; + case PPC::ANDo: + case PPC::AND8o: + case PPC::OR: + case PPC::OR8: + case PPC::XOR: + case PPC::XOR8: + III.SignedImm = false; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = true; + switch(Opc) { + default: llvm_unreachable("Unknown opcode"); + case PPC::ANDo: III.ImmOpcode = PPC::ANDIo; break; + case PPC::AND8o: III.ImmOpcode = PPC::ANDIo8; break; + case PPC::OR: III.ImmOpcode = PPC::ORI; break; + case PPC::OR8: III.ImmOpcode = PPC::ORI8; break; + case PPC::XOR: III.ImmOpcode = PPC::XORI; break; + case PPC::XOR8: III.ImmOpcode = PPC::XORI8; break; + } + break; + case PPC::RLWNM: + case PPC::RLWNM8: + case PPC::RLWNMo: + case PPC::RLWNM8o: + case PPC::RLDCL: + case PPC::RLDCLo: + case PPC::RLDCR: + case PPC::RLDCRo: + case PPC::SLW: + case PPC::SLW8: + case PPC::SLWo: + case PPC::SLW8o: + case PPC::SRW: + case PPC::SRW8: + case PPC::SRWo: + case PPC::SRW8o: + case PPC::SRAW: + case PPC::SRAWo: + case PPC::SLD: + case PPC::SLDo: + case PPC::SRD: + case PPC::SRDo: + case PPC::SRAD: + case PPC::SRADo: + III.SignedImm = false; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = false; + // This isn't actually true, but the instructions ignore any of the + // upper bits, so any immediate loaded with an LI is acceptable. + III.ImmWidth = 16; + switch(Opc) { + default: llvm_unreachable("Unknown opcode"); + case PPC::RLWNM: III.ImmOpcode = PPC::RLWINM; break; + case PPC::RLWNM8: III.ImmOpcode = PPC::RLWINM8; break; + case PPC::RLWNMo: III.ImmOpcode = PPC::RLWINMo; break; + case PPC::RLWNM8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::RLDCL: III.ImmOpcode = PPC::RLDICL; break; + case PPC::RLDCLo: III.ImmOpcode = PPC::RLDICLo; break; + case PPC::RLDCR: III.ImmOpcode = PPC::RLDICR; break; + case PPC::RLDCRo: III.ImmOpcode = PPC::RLDICRo; break; + case PPC::SLW: III.ImmOpcode = PPC::RLWINM; break; + case PPC::SLW8: III.ImmOpcode = PPC::RLWINM8; break; + case PPC::SLWo: III.ImmOpcode = PPC::RLWINMo; break; + case PPC::SLW8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::SRW: III.ImmOpcode = PPC::RLWINM; break; + case PPC::SRW8: III.ImmOpcode = PPC::RLWINM8; break; + case PPC::SRWo: III.ImmOpcode = PPC::RLWINMo; break; + case PPC::SRW8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::SRAW: III.ImmOpcode = PPC::SRAWI; break; + case PPC::SRAWo: III.ImmOpcode = PPC::SRAWIo; break; + case PPC::SLD: III.ImmOpcode = PPC::RLDICR; break; + case PPC::SLDo: III.ImmOpcode = PPC::RLDICRo; break; + case PPC::SRD: III.ImmOpcode = PPC::RLDICL; break; + case PPC::SRDo: III.ImmOpcode = PPC::RLDICLo; break; + case PPC::SRAD: III.ImmOpcode = PPC::SRADI; break; + case PPC::SRADo: III.ImmOpcode = PPC::SRADIo; break; + } + break; + // Loads and stores: + case PPC::LBZX: + case PPC::LBZX8: + case PPC::LHZX: + case PPC::LHZX8: + case PPC::LHAX: + case PPC::LHAX8: + case PPC::LWZX: + case PPC::LWZX8: + case PPC::LWAX: + case PPC::LDX: + case PPC::LFSX: + case PPC::LFDX: + case PPC::STBX: + case PPC::STBX8: + case PPC::STHX: + case PPC::STHX8: + case PPC::STWX: + case PPC::STWX8: + case PPC::STDX: + case PPC::STFSX: + case PPC::STFDX: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 1; + III.ZeroIsSpecialNew = 2; + III.IsCommutative = true; + III.ImmOpNo = 1; + III.ConstantOpNo = 2; + switch(Opc) { + default: llvm_unreachable("Unknown opcode"); + case PPC::LBZX: III.ImmOpcode = PPC::LBZ; break; + case PPC::LBZX8: III.ImmOpcode = PPC::LBZ8; break; + case PPC::LHZX: III.ImmOpcode = PPC::LHZ; break; + case PPC::LHZX8: 
III.ImmOpcode = PPC::LHZ8; break; + case PPC::LHAX: III.ImmOpcode = PPC::LHA; break; + case PPC::LHAX8: III.ImmOpcode = PPC::LHA8; break; + case PPC::LWZX: III.ImmOpcode = PPC::LWZ; break; + case PPC::LWZX8: III.ImmOpcode = PPC::LWZ8; break; + case PPC::LWAX: + III.ImmOpcode = PPC::LWA; + III.ImmMustBeMultipleOf = 4; + break; + case PPC::LDX: III.ImmOpcode = PPC::LD; III.ImmMustBeMultipleOf = 4; break; + case PPC::LFSX: III.ImmOpcode = PPC::LFS; break; + case PPC::LFDX: III.ImmOpcode = PPC::LFD; break; + case PPC::STBX: III.ImmOpcode = PPC::STB; break; + case PPC::STBX8: III.ImmOpcode = PPC::STB8; break; + case PPC::STHX: III.ImmOpcode = PPC::STH; break; + case PPC::STHX8: III.ImmOpcode = PPC::STH8; break; + case PPC::STWX: III.ImmOpcode = PPC::STW; break; + case PPC::STWX8: III.ImmOpcode = PPC::STW8; break; + case PPC::STDX: + III.ImmOpcode = PPC::STD; + III.ImmMustBeMultipleOf = 4; + break; + case PPC::STFSX: III.ImmOpcode = PPC::STFS; break; + case PPC::STFDX: III.ImmOpcode = PPC::STFD; break; + } + break; + case PPC::LBZUX: + case PPC::LBZUX8: + case PPC::LHZUX: + case PPC::LHZUX8: + case PPC::LHAUX: + case PPC::LHAUX8: + case PPC::LWZUX: + case PPC::LWZUX8: + case PPC::LDUX: + case PPC::LFSUX: + case PPC::LFDUX: + case PPC::STBUX: + case PPC::STBUX8: + case PPC::STHUX: + case PPC::STHUX8: + case PPC::STWUX: + case PPC::STWUX8: + case PPC::STDUX: + case PPC::STFSUX: + case PPC::STFDUX: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 2; + III.ZeroIsSpecialNew = 3; + III.IsCommutative = false; + III.ImmOpNo = 2; + III.ConstantOpNo = 3; + switch(Opc) { + default: llvm_unreachable("Unknown opcode"); + case PPC::LBZUX: III.ImmOpcode = PPC::LBZU; break; + case PPC::LBZUX8: III.ImmOpcode = PPC::LBZU8; break; + case PPC::LHZUX: III.ImmOpcode = PPC::LHZU; break; + case PPC::LHZUX8: III.ImmOpcode = PPC::LHZU8; break; + case PPC::LHAUX: III.ImmOpcode = PPC::LHAU; break; + case PPC::LHAUX8: III.ImmOpcode = PPC::LHAU8; break; + case PPC::LWZUX: III.ImmOpcode = PPC::LWZU; break; + case PPC::LWZUX8: III.ImmOpcode = PPC::LWZU8; break; + case PPC::LDUX: + III.ImmOpcode = PPC::LDU; + III.ImmMustBeMultipleOf = 4; + break; + case PPC::LFSUX: III.ImmOpcode = PPC::LFSU; break; + case PPC::LFDUX: III.ImmOpcode = PPC::LFDU; break; + case PPC::STBUX: III.ImmOpcode = PPC::STBU; break; + case PPC::STBUX8: III.ImmOpcode = PPC::STBU8; break; + case PPC::STHUX: III.ImmOpcode = PPC::STHU; break; + case PPC::STHUX8: III.ImmOpcode = PPC::STHU8; break; + case PPC::STWUX: III.ImmOpcode = PPC::STWU; break; + case PPC::STWUX8: III.ImmOpcode = PPC::STWU8; break; + case PPC::STDUX: + III.ImmOpcode = PPC::STDU; + III.ImmMustBeMultipleOf = 4; + break; + case PPC::STFSUX: III.ImmOpcode = PPC::STFSU; break; + case PPC::STFDUX: III.ImmOpcode = PPC::STFDU; break; + } + break; + // Power9 only. 
+ case PPC::LXVX:
+ case PPC::LXSSPX:
+ case PPC::LXSDX:
+ case PPC::STXVX:
+ case PPC::STXSSPX:
+ case PPC::STXSDX:
+ if (!Subtarget.hasP9Vector())
+ return false;
+ III.SignedImm = true;
+ III.ZeroIsSpecialOrig = 1;
+ III.ZeroIsSpecialNew = 2;
+ III.IsCommutative = true;
+ III.ImmOpNo = 1;
+ III.ConstantOpNo = 2;
+ switch(Opc) {
+ default: llvm_unreachable("Unknown opcode");
+ case PPC::LXVX:
+ III.ImmOpcode = PPC::LXV;
+ III.ImmMustBeMultipleOf = 16;
+ break;
+ case PPC::LXSSPX:
+ III.ImmOpcode = PPC::LXSSP;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::LXSDX:
+ III.ImmOpcode = PPC::LXSD;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::STXVX:
+ III.ImmOpcode = PPC::STXV;
+ III.ImmMustBeMultipleOf = 16;
+ break;
+ case PPC::STXSSPX:
+ III.ImmOpcode = PPC::STXSSP;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::STXSDX:
+ III.ImmOpcode = PPC::STXSD;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ }
+ break;
+ }
+ return true;
+}
+
+// Utility function for swapping two arbitrary operands of an instruction.
+static void swapMIOperands(MachineInstr &MI, unsigned Op1, unsigned Op2) {
+ assert(Op1 != Op2 && "Cannot swap operand with itself.");
+
+ unsigned MaxOp = std::max(Op1, Op2);
+ unsigned MinOp = std::min(Op1, Op2);
+ MachineOperand MOp1 = MI.getOperand(MinOp);
+ MachineOperand MOp2 = MI.getOperand(MaxOp);
+ MI.RemoveOperand(std::max(Op1, Op2));
+ MI.RemoveOperand(std::min(Op1, Op2));
+
+ // If the operands we are swapping are the two at the end (the common case)
+ // we can just remove both and add them in the opposite order.
+ if (MaxOp - MinOp == 1 && MI.getNumOperands() == MinOp) {
+ MI.addOperand(MOp2);
+ MI.addOperand(MOp1);
+ } else {
+ // Store all operands in a temporary vector, remove them and re-add in the
+ // right order.
+ SmallVector<MachineOperand, 2> MOps;
+ unsigned TotalOps = MI.getNumOperands() + 2; // We've already removed 2 ops.
+ for (unsigned i = MI.getNumOperands() - 1; i >= MinOp; i--) {
+ MOps.push_back(MI.getOperand(i));
+ MI.RemoveOperand(i);
+ }
+ // MOp2 needs to be added next.
+ MI.addOperand(MOp2);
+ // Now add the rest.
+ for (unsigned i = MI.getNumOperands(); i < TotalOps; i++) {
+ if (i == MaxOp)
+ MI.addOperand(MOp1);
+ else {
+ MI.addOperand(MOps.back());
+ MOps.pop_back();
+ }
+ }
+ }
+}
+
+bool PPCInstrInfo::transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
+ unsigned ConstantOpNo,
+ int64_t Imm) const {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ bool PostRA = !MRI.isSSA();
+ // Exit early if we can't convert this.
+ if ((ConstantOpNo != III.ConstantOpNo) && !III.IsCommutative)
+ return false;
+ if (Imm % III.ImmMustBeMultipleOf)
+ return false;
+ if (III.SignedImm) {
+ APInt ActualValue(64, Imm, true);
+ if (!ActualValue.isSignedIntN(III.ImmWidth))
+ return false;
+ } else {
+ uint64_t UnsignedMax = (1 << III.ImmWidth) - 1;
+ if ((uint64_t)Imm > UnsignedMax)
+ return false;
+ }
+
+ // If we're post-RA and the instructions don't agree on whether register zero
+ // is special, we can transform this as long as the register operand that
+ // will end up in the location where zero is special isn't R0.
+ if (PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) {
+ unsigned PosForOrigZero = III.ZeroIsSpecialOrig ?
III.ZeroIsSpecialOrig :
+ III.ZeroIsSpecialNew + 1;
+ unsigned OrigZeroReg = MI.getOperand(PosForOrigZero).getReg();
+ unsigned NewZeroReg = MI.getOperand(III.ZeroIsSpecialNew).getReg();
+ // If R0 is in the operand where zero is special for the new instruction,
+ // it is unsafe to transform if the constant operand isn't that operand.
+ if ((NewZeroReg == PPC::R0 || NewZeroReg == PPC::X0) &&
+ ConstantOpNo != III.ZeroIsSpecialNew)
+ return false;
+ if ((OrigZeroReg == PPC::R0 || OrigZeroReg == PPC::X0) &&
+ ConstantOpNo != PosForOrigZero)
+ return false;
+ }
+
+ unsigned Opc = MI.getOpcode();
+ bool SpecialShift32 =
+ Opc == PPC::SLW || Opc == PPC::SLWo || Opc == PPC::SRW || Opc == PPC::SRWo;
+ bool SpecialShift64 =
+ Opc == PPC::SLD || Opc == PPC::SLDo || Opc == PPC::SRD || Opc == PPC::SRDo;
+ bool SetCR = Opc == PPC::SLWo || Opc == PPC::SRWo ||
+ Opc == PPC::SLDo || Opc == PPC::SRDo;
+ bool RightShift =
+ Opc == PPC::SRW || Opc == PPC::SRWo || Opc == PPC::SRD || Opc == PPC::SRDo;
+
+ MI.setDesc(get(III.ImmOpcode));
+ if (ConstantOpNo == III.ConstantOpNo) {
+ // Converting shifts to immediate form is a bit tricky since they may do
+ // one of three things:
+ // 1. If the shift amount is between OpSize and 2*OpSize, the result is zero
+ // 2. If the shift amount is zero, the result is unchanged (save for maybe
+ // setting CR0)
+ // 3. If the shift amount is in [1, OpSize), it's just a shift
+ if (SpecialShift32 || SpecialShift64) {
+ LoadImmediateInfo LII;
+ LII.Imm = 0;
+ LII.SetCR = SetCR;
+ LII.Is64Bit = SpecialShift64;
+ uint64_t ShAmt = Imm & (SpecialShift32 ? 0x1F : 0x3F);
+ if (Imm & (SpecialShift32 ? 0x20 : 0x40))
+ replaceInstrWithLI(MI, LII);
+ // Shifts by zero don't change the value. If we don't need to set CR0,
+ // just convert this to a COPY. Can't do this post-RA since we've already
+ // cleaned up the copies.
+ else if (!SetCR && ShAmt == 0 && !PostRA) {
+ MI.RemoveOperand(2);
+ MI.setDesc(get(PPC::COPY));
+ } else {
+ // The 32 bit and 64 bit instructions are quite different.
+ if (SpecialShift32) {
+ // Left shifts use (N, 0, 31-N), right shifts use (32-N, N, 31).
+ uint64_t SH = RightShift ? 32 - ShAmt : ShAmt;
+ uint64_t MB = RightShift ? ShAmt : 0;
+ uint64_t ME = RightShift ? 31 : 31 - ShAmt;
+ MI.getOperand(III.ConstantOpNo).ChangeToImmediate(SH);
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB)
+ .addImm(ME);
+ } else {
+ // Left shifts use (N, 63-N), right shifts use (64-N, N).
+ uint64_t SH = RightShift ? 64 - ShAmt : ShAmt;
+ uint64_t ME = RightShift ? ShAmt : 63 - ShAmt;
+ MI.getOperand(III.ConstantOpNo).ChangeToImmediate(SH);
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME);
+ }
+ }
+ } else
+ MI.getOperand(ConstantOpNo).ChangeToImmediate(Imm);
+ }
+ // Convert commutative instructions (switch the operands and convert the
+ // desired one to an immediate).
+ else if (III.IsCommutative) {
+ MI.getOperand(ConstantOpNo).ChangeToImmediate(Imm);
+ swapMIOperands(MI, ConstantOpNo, III.ConstantOpNo);
+ } else
+ llvm_unreachable("Should have exited early!");
+
+ // For instructions for which the constant register replaces a different
+ // operand than where the immediate goes, we need to swap them.
+ if (III.ConstantOpNo != III.ImmOpNo)
+ swapMIOperands(MI, III.ConstantOpNo, III.ImmOpNo);
+
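// The 64-bit (SH, MB/ME) encodings chosen above can be checked against plain
// C shift semantics: sldi N is rldicr(N, 63-N) and srdi N is rldicl(64-N, N).
// A standalone sketch of the rotate-and-mask semantics (illustrative helpers,
// not LLVM code):
#include <cassert>
#include <cstdint>

static uint64_t rotl64(uint64_t V, unsigned SH) {
  return SH ? (V << SH) | (V >> (64 - SH)) : V;
}
// rldicl: rotate left, then keep big-endian bits MB..63 (clear to the left).
static uint64_t rldicl(uint64_t V, unsigned SH, unsigned MB) {
  uint64_t Mask = MB ? (1ULL << (64 - MB)) - 1 : ~0ULL;
  return rotl64(V, SH) & Mask;
}
// rldicr: rotate left, then keep big-endian bits 0..ME (clear to the right).
static uint64_t rldicr(uint64_t V, unsigned SH, unsigned ME) {
  uint64_t Mask = ~((1ULL << (63 - ME)) - 1);
  return rotl64(V, SH) & Mask;
}

int main() {
  uint64_t V = 0x0123456789ABCDEFULL;
  for (unsigned N = 1; N < 64; ++N) {
    assert(rldicr(V, N, 63 - N) == V << N); // sldi N
    assert(rldicl(V, 64 - N, N) == V >> N); // srdi N
  }
  return 0;
}
+ // If the R0/X0 register is special for the original instruction and not for
+ // the new instruction (or vice versa), we need to fix up the register class.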
+ if (!PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) { + if (!III.ZeroIsSpecialOrig) { + unsigned RegToModify = MI.getOperand(III.ZeroIsSpecialNew).getReg(); + const TargetRegisterClass *NewRC = + MRI.getRegClass(RegToModify)->hasSuperClassEq(&PPC::GPRCRegClass) ? + &PPC::GPRC_and_GPRC_NOR0RegClass : &PPC::G8RC_and_G8RC_NOX0RegClass; + MRI.setRegClass(RegToModify, NewRC); + } + } + return true; +} + const TargetRegisterClass * PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const { if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass) @@ -2012,3 +2984,290 @@ PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const { int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) { return PPC::getRecordFormOpcode(Opcode); } + +// This function returns true if the machine instruction +// always outputs a value by sign-extending a 32 bit value, +// i.e. 0 to 31-th bits are same as 32-th bit. +static bool isSignExtendingOp(const MachineInstr &MI) { + int Opcode = MI.getOpcode(); + if (Opcode == PPC::LI || Opcode == PPC::LI8 || + Opcode == PPC::LIS || Opcode == PPC::LIS8 || + Opcode == PPC::SRAW || Opcode == PPC::SRAWo || + Opcode == PPC::SRAWI || Opcode == PPC::SRAWIo || + Opcode == PPC::LWA || Opcode == PPC::LWAX || + Opcode == PPC::LWA_32 || Opcode == PPC::LWAX_32 || + Opcode == PPC::LHA || Opcode == PPC::LHAX || + Opcode == PPC::LHA8 || Opcode == PPC::LHAX8 || + Opcode == PPC::LBZ || Opcode == PPC::LBZX || + Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 || + Opcode == PPC::LBZU || Opcode == PPC::LBZUX || + Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 || + Opcode == PPC::LHZ || Opcode == PPC::LHZX || + Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 || + Opcode == PPC::LHZU || Opcode == PPC::LHZUX || + Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8 || + Opcode == PPC::EXTSB || Opcode == PPC::EXTSBo || + Opcode == PPC::EXTSH || Opcode == PPC::EXTSHo || + Opcode == PPC::EXTSB8 || Opcode == PPC::EXTSH8 || + Opcode == PPC::EXTSW || Opcode == PPC::EXTSWo || + Opcode == PPC::EXTSH8_32_64 || Opcode == PPC::EXTSW_32_64 || + Opcode == PPC::EXTSB8_32_64) + return true; + + if (Opcode == PPC::RLDICL && MI.getOperand(3).getImm() >= 33) + return true; + + if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo || + Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo) && + MI.getOperand(3).getImm() > 0 && + MI.getOperand(3).getImm() <= MI.getOperand(4).getImm()) + return true; + + return false; +} + +// This function returns true if the machine instruction +// always outputs zeros in higher 32 bits. +static bool isZeroExtendingOp(const MachineInstr &MI) { + int Opcode = MI.getOpcode(); + // The 16-bit immediate is sign-extended in li/lis. + // If the most significant bit is zero, all higher bits are zero. + if (Opcode == PPC::LI || Opcode == PPC::LI8 || + Opcode == PPC::LIS || Opcode == PPC::LIS8) { + int64_t Imm = MI.getOperand(1).getImm(); + if (((uint64_t)Imm & ~0x7FFFuLL) == 0) + return true; + } + + // We have some variations of rotate-and-mask instructions + // that clear higher 32-bits. 
+ if ((Opcode == PPC::RLDICL || Opcode == PPC::RLDICLo ||
+ Opcode == PPC::RLDCL || Opcode == PPC::RLDCLo ||
+ Opcode == PPC::RLDICL_32_64) &&
+ MI.getOperand(3).getImm() >= 32)
+ return true;
+
+ if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDICo) &&
+ MI.getOperand(3).getImm() >= 32 &&
+ MI.getOperand(3).getImm() <= 63 - MI.getOperand(2).getImm())
+ return true;
+
+ if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo ||
+ Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo ||
+ Opcode == PPC::RLWINM8 || Opcode == PPC::RLWNM8) &&
+ MI.getOperand(3).getImm() <= MI.getOperand(4).getImm())
+ return true;
+
+ // There are other instructions that clear higher 32-bits.
+ if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZWo ||
+ Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZWo ||
+ Opcode == PPC::CNTLZW8 || Opcode == PPC::CNTTZW8 ||
+ Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZDo ||
+ Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZDo ||
+ Opcode == PPC::POPCNTD || Opcode == PPC::POPCNTW ||
+ Opcode == PPC::SLW || Opcode == PPC::SLWo ||
+ Opcode == PPC::SRW || Opcode == PPC::SRWo ||
+ Opcode == PPC::SLW8 || Opcode == PPC::SRW8 ||
+ Opcode == PPC::SLWI || Opcode == PPC::SLWIo ||
+ Opcode == PPC::SRWI || Opcode == PPC::SRWIo ||
+ Opcode == PPC::LWZ || Opcode == PPC::LWZX ||
+ Opcode == PPC::LWZU || Opcode == PPC::LWZUX ||
+ Opcode == PPC::LWBRX || Opcode == PPC::LHBRX ||
+ Opcode == PPC::LHZ || Opcode == PPC::LHZX ||
+ Opcode == PPC::LHZU || Opcode == PPC::LHZUX ||
+ Opcode == PPC::LBZ || Opcode == PPC::LBZX ||
+ Opcode == PPC::LBZU || Opcode == PPC::LBZUX ||
+ Opcode == PPC::LWZ8 || Opcode == PPC::LWZX8 ||
+ Opcode == PPC::LWZU8 || Opcode == PPC::LWZUX8 ||
+ Opcode == PPC::LWBRX8 || Opcode == PPC::LHBRX8 ||
+ Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 ||
+ Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8 ||
+ Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 ||
+ Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 ||
+ Opcode == PPC::ANDIo || Opcode == PPC::ANDISo ||
+ Opcode == PPC::ROTRWI || Opcode == PPC::ROTRWIo ||
+ Opcode == PPC::EXTLWI || Opcode == PPC::EXTLWIo ||
+ Opcode == PPC::MFVSRWZ)
+ return true;
+
+ return false;
+}
+
+// This function returns true if the input MachineInstr is a TOC save
+// instruction.
+bool PPCInstrInfo::isTOCSaveMI(const MachineInstr &MI) const {
+ if (!MI.getOperand(1).isImm() || !MI.getOperand(2).isReg())
+ return false;
+ unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+ unsigned StackOffset = MI.getOperand(1).getImm();
+ unsigned StackReg = MI.getOperand(2).getReg();
+ if (StackReg == PPC::X1 && StackOffset == TOCSaveOffset)
+ return true;
+
+ return false;
+}
+
+// We limit the max depth to track incoming values of PHIs or binary ops
+// (e.g. AND) to avoid excessive cost.
+const unsigned MAX_DEPTH = 1;
+
+bool
+PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
+ const unsigned Depth) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+ // If we know this instruction returns a sign- or zero-extended result,
+ // return true.
+ if (SignExt ? isSignExtendingOp(MI) :
+ isZeroExtendingOp(MI))
+ return true;
+
+ switch (MI.getOpcode()) {
+ case PPC::COPY: {
+ unsigned SrcReg = MI.getOperand(1).getReg();
+
+ // In both ELFv1 and v2 ABI, method parameters and the return value
+ // are sign- or zero-extended.
+ if (MF->getSubtarget<PPCSubtarget>().isSVR4ABI()) {
+ const PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
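// Two of the claims above, sanity-checked on plain integers: an li immediate
// whose sign bit is clear leaves all higher bits zero, and rldicl with
// MB >= 32 cannot leave anything set in the upper word. A standalone sketch,
// mirroring only the semantics:
#include <cassert>
#include <cstdint>

int main() {
  // li sign-extends its 16-bit immediate; a clear bit 15 means zero upper bits.
  for (uint32_t I = 0; I <= 0x7FFF; ++I)
    assert(((uint64_t)(int64_t)(int16_t)I >> 32) == 0);
  // rldicl keeps big-endian bits MB..63; MB >= 32 leaves at most 32 low bits.
  uint64_t V = 0xFFFFFFFFFFFFFFFFULL;
  for (unsigned MB = 32; MB <= 63; ++MB) {
    uint64_t Res = V & ((1ULL << (64 - MB)) - 1);
    assert((Res >> 32) == 0);
  }
  return 0;
}
+ // We check the ZExt/SExt flags for a method parameter.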
+ if (MI.getParent()->getBasicBlock() ==
+ &MF->getFunction().getEntryBlock()) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ if (MF->getRegInfo().isLiveIn(VReg))
+ return SignExt ? FuncInfo->isLiveInSExt(VReg) :
+ FuncInfo->isLiveInZExt(VReg);
+ }
+
+ // For a method return value, we check the ZExt/SExt flags in the attribute.
+ // We assume the following code sequence for a method call.
+ // ADJCALLSTACKDOWN 32, implicit dead %r1, implicit %r1
+ // BL8_NOP @func,...
+ // ADJCALLSTACKUP 32, 0, implicit dead %r1, implicit %r1
+ // %5 = COPY %x3; G8RC:%5
+ if (SrcReg == PPC::X3) {
+ const MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::const_instr_iterator II =
+ MachineBasicBlock::const_instr_iterator(&MI);
+ if (II != MBB->instr_begin() &&
+ (--II)->getOpcode() == PPC::ADJCALLSTACKUP) {
+ const MachineInstr &CallMI = *(--II);
+ if (CallMI.isCall() && CallMI.getOperand(0).isGlobal()) {
+ const Function *CalleeFn =
+ dyn_cast<Function>(CallMI.getOperand(0).getGlobal());
+ if (!CalleeFn)
+ return false;
+ const IntegerType *IntTy =
+ dyn_cast<IntegerType>(CalleeFn->getReturnType());
+ const AttributeSet &Attrs =
+ CalleeFn->getAttributes().getRetAttributes();
+ if (IntTy && IntTy->getBitWidth() <= 32)
+ return Attrs.hasAttribute(SignExt ? Attribute::SExt :
+ Attribute::ZExt);
+ }
+ }
+ }
+ }
+
+ // If this is a copy from another register, we recursively check the source.
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+ const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI != NULL)
+ return isSignOrZeroExtended(*SrcMI, SignExt, Depth);
+
+ return false;
+ }
+
+ case PPC::ANDIo:
+ case PPC::ANDISo:
+ case PPC::ORI:
+ case PPC::ORIS:
+ case PPC::XORI:
+ case PPC::XORIS:
+ case PPC::ANDIo8:
+ case PPC::ANDISo8:
+ case PPC::ORI8:
+ case PPC::ORIS8:
+ case PPC::XORI8:
+ case PPC::XORIS8: {
+ // A logical operation with a 16-bit immediate does not change the upper
+ // bits. So, we track the operand register as we do for a register copy.
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+ const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI != NULL)
+ return isSignOrZeroExtended(*SrcMI, SignExt, Depth);
+
+ return false;
+ }
+
+ // If all incoming values are sign-/zero-extended,
+ // the output of OR, ISEL or PHI is also sign-/zero-extended.
+ case PPC::OR:
+ case PPC::OR8:
+ case PPC::ISEL:
+ case PPC::PHI: {
+ if (Depth >= MAX_DEPTH)
+ return false;
+
+ // The input registers for PHI are operand 1, 3, ...
+ // The input registers for others are operand 1 and 2.
+ unsigned E = 3, D = 1;
+ if (MI.getOpcode() == PPC::PHI) {
+ E = MI.getNumOperands();
+ D = 2;
+ }
+
+ for (unsigned I = 1; I != E; I += D) {
+ if (MI.getOperand(I).isReg()) {
+ unsigned SrcReg = MI.getOperand(I).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+ const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI == NULL || !isSignOrZeroExtended(*SrcMI, SignExt, Depth+1))
+ return false;
+ }
+ else
+ return false;
+ }
+ return true;
+ }
+
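// The AND case handled next relies on two facts about 64-bit integers: if
// either input has its upper 32 bits clear, so does the AND; and if both
// inputs are sign-extensions of 32-bit values, so is the AND. A standalone
// sanity check (sketch, not LLVM code):
#include <cassert>
#include <cstdint>

static bool isZExt64(uint64_t V) { return (V >> 32) == 0; }
static bool isSExt64(uint64_t V) {
  return (int64_t)V == (int64_t)(int32_t)(uint32_t)V;
}

int main() {
  uint64_t Z = 0x00000000DEADBEEFULL; // upper word clear (zero-extended)
  uint64_t S = 0xFFFFFFFF80000000ULL; // sign-extension of a 32-bit value
  uint64_t X = 0x123456789ABCDEF0ULL; // neither
  assert(isZExt64(Z & X));            // one zero-extended input is enough
  assert(isSExt64(S & 0xFFFFFFFFFFFFFFFEULL)); // both sign-extended inputs
  return 0;
}
+ // If at least one of the incoming values of an AND is zero-extended,
+ // then the output is also zero-extended. If both of the incoming values
+ // are sign-extended, then the output is also sign-extended.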
+ case PPC::AND: + case PPC::AND8: { + if (Depth >= MAX_DEPTH) + return false; + + assert(MI.getOperand(1).isReg() && MI.getOperand(2).isReg()); + + unsigned SrcReg1 = MI.getOperand(1).getReg(); + unsigned SrcReg2 = MI.getOperand(2).getReg(); + + if (!TargetRegisterInfo::isVirtualRegister(SrcReg1) || + !TargetRegisterInfo::isVirtualRegister(SrcReg2)) + return false; + + const MachineInstr *MISrc1 = MRI->getVRegDef(SrcReg1); + const MachineInstr *MISrc2 = MRI->getVRegDef(SrcReg2); + if (!MISrc1 || !MISrc2) + return false; + + if(SignExt) + return isSignOrZeroExtended(*MISrc1, SignExt, Depth+1) && + isSignOrZeroExtended(*MISrc2, SignExt, Depth+1); + else + return isSignOrZeroExtended(*MISrc1, SignExt, Depth+1) || + isSignOrZeroExtended(*MISrc2, SignExt, Depth+1); + } + + default: + break; + } + return false; +} diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index b0629c88cf57b..4271c50127a1d 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -16,7 +16,7 @@ #include "PPC.h" #include "PPCRegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "PPCGenInstrInfo.inc" @@ -72,6 +72,41 @@ enum { }; } // end namespace PPCII +// Instructions that have an immediate form might be convertible to that +// form if the correct input is a result of a load immediate. In order to +// know whether the transformation is special, we might need to know some +// of the details of the two forms. +struct ImmInstrInfo { + // Is the immediate field in the immediate form signed or unsigned? + uint64_t SignedImm : 1; + // Does the immediate need to be a multiple of some value? + uint64_t ImmMustBeMultipleOf : 5; + // Is R0/X0 treated specially by the original r+r instruction? + // If so, in which operand? + uint64_t ZeroIsSpecialOrig : 3; + // Is R0/X0 treated specially by the new r+i instruction? + // If so, in which operand? + uint64_t ZeroIsSpecialNew : 3; + // Is the operation commutative? + uint64_t IsCommutative : 1; + // The operand number to check for load immediate. + uint64_t ConstantOpNo : 3; + // The operand number for the immediate. + uint64_t ImmOpNo : 3; + // The opcode of the new instruction. + uint64_t ImmOpcode : 16; + // The size of the immediate. + uint64_t ImmWidth : 5; +}; + +// Information required to convert an instruction to just a materialized +// immediate. +struct LoadImmediateInfo { + unsigned Imm : 16; + unsigned Is64Bit : 1; + unsigned SetCR : 1; +}; + class PPCSubtarget; class PPCInstrInfo : public PPCGenInstrInfo { PPCSubtarget &Subtarget; @@ -87,6 +122,10 @@ class PPCInstrInfo : public PPCGenInstrInfo { const TargetRegisterClass *RC, SmallVectorImpl &NewMIs, bool &NonRI, bool &SpillsVRS) const; + bool transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III, + unsigned ConstantOpNo, int64_t Imm) const; + MachineInstr *getConstantDefMI(MachineInstr &MI, unsigned &ConstOp, + bool &SeenIntermediateUse) const; virtual void anchor(); protected: @@ -282,6 +321,9 @@ public: ArrayRef> getSerializableBitmaskMachineOperandTargetFlags() const override; + // Expand VSX Memory Pseudo instruction to either a VSX or a FP instruction. + bool expandVSXMemPseudo(MachineInstr &MI) const; + // Lower pseudo instructions after register allocation. 
bool expandPostRAPseudo(MachineInstr &MI) const override; @@ -293,6 +335,36 @@ public: } const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const; static int getRecordFormOpcode(unsigned Opcode); + + bool isTOCSaveMI(const MachineInstr &MI) const; + + bool isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, + const unsigned PhiDepth) const; + + /// Return true if the output of the instruction is always a sign-extended, + /// i.e. 0 to 31-th bits are same as 32-th bit. + bool isSignExtended(const MachineInstr &MI, const unsigned depth = 0) const { + return isSignOrZeroExtended(MI, true, depth); + } + + /// Return true if the output of the instruction is always zero-extended, + /// i.e. 0 to 31-th bits are all zeros + bool isZeroExtended(const MachineInstr &MI, const unsigned depth = 0) const { + return isSignOrZeroExtended(MI, false, depth); + } + + bool convertToImmediateForm(MachineInstr &MI, + MachineInstr **KilledDef = nullptr) const; + void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const; + + // This is used to find the "true" source register for n + // Machine instruction. Returns the original SrcReg unless it is the target + // of a copy-like operation, in which case we chain backwards through all + // such operations to the ultimate source register. If a + // physical register is encountered, we stop the search. + static unsigned lookThruCopyLike(unsigned SrcReg, + const MachineRegisterInfo *MRI); + bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III) const; }; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index dd7fc2659102a..a932d05b24eef 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -181,7 +181,7 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; -def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>; +def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>; def PPCxxreverse : SDNode<"PPCISD::XXREVERSE", SDT_PPCVecReverse, []>; def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; @@ -1057,6 +1057,20 @@ multiclass XSForm_1rc opcode, bits<9> xo, dag OOL, dag IOL, } } +multiclass XSForm_1r opcode, bits<9> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list pattern> { + let BaseName = asmbase in { + def NAME : XSForm_1, RecFormRel; + let Defs = [CR0] in + def o : XSForm_1, isDOT, RecFormRel; + } +} + multiclass XForm_26r opcode, bits<10> xo, dag OOL, dag IOL, string asmbase, string asmstr, InstrItinClass itin, list pattern> { @@ -1576,6 +1590,11 @@ def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)), (ICBT 0, xoaddr:$dst)>, Requires<[HasICBT]>; // inst prefetch (for read) // Atomic operations +// FIXME: some of these might be used with constant operands. This will result +// in constant materialization instructions that may be redundant. We currently +// clean this up in PPCMIPeephole with calls to +// PPCInstrInfo::convertToImmediateForm() but we should probably not emit them +// in the first place. let usesCustomInserter = 1 in { let Defs = [CR0] in { def ATOMIC_LOAD_ADD_I8 : Pseudo< @@ -2571,6 +2590,35 @@ let Uses = [RM] in { let Defs = [CR1] in def MFFSo : XForm_42<63, 583, (outs f8rc:$rT), (ins), "mffs. 
$rT", IIC_IntMFFS, []>, isDOT; + + def MFFSCE : X_FRT5_XO2_XO3_XO10<63, 0, 1, 583, (outs f8rc:$rT), (ins), + "mffsce $rT", IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + def MFFSCDRN : X_FRT5_XO2_XO3_FRB5_XO10<63, 2, 4, 583, (outs f8rc:$rT), + (ins f8rc:$FRB), "mffscdrn $rT, $FRB", + IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + def MFFSCDRNI : X_FRT5_XO2_XO3_DRM3_XO10<63, 2, 5, 583, (outs f8rc:$rT), + (ins u3imm:$DRM), + "mffscdrni $rT, $DRM", + IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + def MFFSCRN : X_FRT5_XO2_XO3_FRB5_XO10<63, 2, 6, 583, (outs f8rc:$rT), + (ins f8rc:$FRB), "mffscrn $rT, $FRB", + IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + def MFFSCRNI : X_FRT5_XO2_XO3_RM2_X10<63, 2, 7, 583, (outs f8rc:$rT), + (ins u2imm:$RM), "mffscrni $rT, $RM", + IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + def MFFSL : X_FRT5_XO2_XO3_XO10<63, 3, 0, 583, (outs f8rc:$rT), (ins), + "mffsl $rT", IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; } let Predicates = [IsISA3_0] in { @@ -3890,6 +3938,63 @@ def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), "stdcix $RST, $A, $B", IIC_LdStLoad, []>; +// External PID Load Store Instructions + +def LBEPX : XForm_1<31, 95, (outs gprc:$rD), (ins memrr:$src), + "lbepx $rD, $src", IIC_LdStLoad, []>, + Requires<[IsE500]>; + +def LFDEPX : XForm_25<31, 607, (outs f8rc:$frD), (ins memrr:$src), + "lfdepx $frD, $src", IIC_LdStLFD, []>, + Requires<[IsE500]>; + +def LHEPX : XForm_1<31, 287, (outs gprc:$rD), (ins memrr:$src), + "lhepx $rD, $src", IIC_LdStLoad, []>, + Requires<[IsE500]>; + +def LWEPX : XForm_1<31, 31, (outs gprc:$rD), (ins memrr:$src), + "lwepx $rD, $src", IIC_LdStLoad, []>, + Requires<[IsE500]>; + +def STBEPX : XForm_8<31, 223, (outs), (ins gprc:$rS, memrr:$dst), + "stbepx $rS, $dst", IIC_LdStStore, []>, + Requires<[IsE500]>; + +def STFDEPX : XForm_28<31, 735, (outs), (ins f8rc:$frS, memrr:$dst), + "stfdepx $frS, $dst", IIC_LdStSTFD, []>, + Requires<[IsE500]>; + +def STHEPX : XForm_8<31, 415, (outs), (ins gprc:$rS, memrr:$dst), + "sthepx $rS, $dst", IIC_LdStStore, []>, + Requires<[IsE500]>; + +def STWEPX : XForm_8<31, 159, (outs), (ins gprc:$rS, memrr:$dst), + "stwepx $rS, $dst", IIC_LdStStore, []>, + Requires<[IsE500]>; + +def DCBFEP : DCB_Form<127, 0, (outs), (ins memrr:$dst), "dcbfep $dst", + IIC_LdStDCBF, []>, Requires<[IsE500]>; + +def DCBSTEP : DCB_Form<63, 0, (outs), (ins memrr:$dst), "dcbstep $dst", + IIC_LdStDCBF, []>, Requires<[IsE500]>; + +def DCBTEP : DCB_Form_hint<319, (outs), (ins memrr:$dst, u5imm:$TH), + "dcbtep $TH, $dst", IIC_LdStDCBF, []>, + Requires<[IsE500]>; + +def DCBTSTEP : DCB_Form_hint<255, (outs), (ins memrr:$dst, u5imm:$TH), + "dcbtstep $TH, $dst", IIC_LdStDCBF, []>, + Requires<[IsE500]>; + +def DCBZEP : DCB_Form<1023, 0, (outs), (ins memrr:$dst), "dcbzep $dst", + IIC_LdStDCBF, []>, Requires<[IsE500]>; + +def DCBZLEP : DCB_Form<1023, 1, (outs), (ins memrr:$dst), "dcbzlep $dst", + IIC_LdStDCBF, []>, Requires<[IsE500]>; + +def ICBIEP : XForm_1a<31, 991, (outs), (ins memrr:$src), "icbiep $src", + IIC_LdStICBI, []>, Requires<[IsE500]>; + //===----------------------------------------------------------------------===// // PowerPC Assembler Instruction Aliases // @@ -3908,6 +4013,7 @@ class PPCAsmPseudo let AsmString = asm; let isAsmParserOnly = 1; let isPseudo = 1; + let hasNoSchedulingInfo = 1; } def : InstAlias<"sc", (SC 0)>; @@ 
-4208,6 +4314,7 @@ def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>; def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>; +def SUBPCIS : PPCAsmPseudo<"subpcis $RT, $D", (ins g8rc:$RT, s16imm:$D)>; def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; @@ -4215,8 +4322,9 @@ def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; def : InstAlias<"clrldi $rA, $rS, $n", - (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>; + (RLDICL_32_64 g8rc:$rA, gprc:$rS, 0, u6imm:$n)>; def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; +def : InstAlias<"lnia $RT", (ADDPCIS g8rc:$RT, 0)>; def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; @@ -4233,7 +4341,7 @@ def RLWNMobm : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b", // These generic branch instruction forms are used for the assembler parser only. // Defs and Uses are conservative, since we don't know the BO value. -let PPC970_Unit = 7 in { +let PPC970_Unit = 7, isBranch = 1 in { let Defs = [CTR], Uses = [CTR, RM] in { def gBC : BForm_3<16, 0, 0, (outs), (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst), @@ -4550,7 +4658,7 @@ def : Pat<(i32 (bitreverse i32:$A)), // n = ((n >> 2) & 0x3333333333333333) | ((n << 2) & 0xCCCCCCCCCCCCCCCC); // Step 3: 4-bit swap (swap odd 4-bit and even 4-bit): // n = ((n >> 4) & 0x0F0F0F0F0F0F0F0F) | ((n << 4) & 0xF0F0F0F0F0F0F0F0); -// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4,B5,B6,B7,B8]): +// Step 4: byte reverse (Suppose n = [B0,B1,B2,B3,B4,B5,B6,B7]): // Apply the same byte reverse algorithm mentioned above for the fast 32-bit // reverse to both the high 32 bit and low 32 bit of the 64 bit value. And // then OR them together to get the final result. @@ -4572,92 +4680,55 @@ def DWMaskValues { dag Hi4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi4, 32, 31), 0xF0F0), 0xF0F0); } -def DWShift1 { - dag Right = (RLDICL $A, 63, 1); - dag Left = (RLDICR $A, 1, 62); -} - -def DWSwap1 { - dag Bit = (OR8 (AND8 DWShift1.Right, DWMaskValues.Lo1), - (AND8 DWShift1.Left, DWMaskValues.Hi1)); -} - -def DWShift2 { - dag Right = (RLDICL DWSwap1.Bit, 62, 2); - dag Left = (RLDICR DWSwap1.Bit, 2, 61); -} - -def DWSwap2 { - dag Bits = (OR8 (AND8 DWShift2.Right, DWMaskValues.Lo2), - (AND8 DWShift2.Left, DWMaskValues.Hi2)); -} - -def DWShift4 { - dag Right = (RLDICL DWSwap2.Bits, 60, 4); - dag Left = (RLDICR DWSwap2.Bits, 4, 59); -} - -def DWSwap4 { - dag Bits = (OR8 (AND8 DWShift4.Right, DWMaskValues.Lo4), - (AND8 DWShift4.Left, DWMaskValues.Hi4)); -} - -// Bit swap is done, now start byte swap. 
-def DWExtractLo32 { - dag SubReg = (i32 (EXTRACT_SUBREG DWSwap4.Bits, sub_32)); -} - -def DWRotateLo32 { - dag Left24 = (RLWINM DWExtractLo32.SubReg, 24, 0, 31); -} - -def DWLo32RotateInsertByte3 { - dag Left = (RLWIMI DWRotateLo32.Left24, DWExtractLo32.SubReg, 8, 8, 15); -} - -// Lower 32 bits in the right order -def DWLo32RotateInsertByte1 { - dag Left = - (RLWIMI DWLo32RotateInsertByte3.Left, DWExtractLo32.SubReg, 8, 24, 31); +def DWSwapInByte { + dag Swap1 = (OR8 (AND8 (RLDICL $A, 63, 1), DWMaskValues.Lo1), + (AND8 (RLDICR $A, 1, 62), DWMaskValues.Hi1)); + dag Swap2 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap1, 62, 2), DWMaskValues.Lo2), + (AND8 (RLDICR DWSwapInByte.Swap1, 2, 61), DWMaskValues.Hi2)); + dag Swap4 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap2, 60, 4), DWMaskValues.Lo4), + (AND8 (RLDICR DWSwapInByte.Swap2, 4, 59), DWMaskValues.Hi4)); } -def ExtendLo32 { - dag To64Bit = - (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - DWLo32RotateInsertByte1.Left, sub_32)); +// Intra-byte swap is done, now start inter-byte swap. +def DWBytes4567 { + dag Word = (i32 (EXTRACT_SUBREG DWSwapInByte.Swap4, sub_32)); } -def DWShiftHi32 { // SRDI DWSwap4.Bits, 32) - dag ToLo32 = (RLDICL DWSwap4.Bits, 32, 32); +def DWBytes7456 { + dag Word = (RLWINM DWBytes4567.Word, 24, 0, 31); } -def DWExtractHi32 { - dag SubReg = (i32 (EXTRACT_SUBREG DWShiftHi32.ToLo32, sub_32)); +def DWBytes7656 { + dag Word = (RLWIMI DWBytes7456.Word, DWBytes4567.Word, 8, 8, 15); } -def DWRotateHi32 { - dag Left24 = (RLWINM DWExtractHi32.SubReg, 24, 0, 31); +// B7 B6 B5 B4 in the right order +def DWBytes7654 { + dag Word = (RLWIMI DWBytes7656.Word, DWBytes4567.Word, 8, 24, 31); + dag DWord = + (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes7654.Word, sub_32)); } -def DWHi32RotateInsertByte3 { - dag Left = (RLWIMI DWRotateHi32.Left24, DWExtractHi32.SubReg, 8, 8, 15); +def DWBytes0123 { + dag Word = (i32 (EXTRACT_SUBREG (RLDICL DWSwapInByte.Swap4, 32, 32), sub_32)); } -// High 32 bits in the right order, but in the low 32-bit position -def DWHi32RotateInsertByte1 { - dag Left = - (RLWIMI DWHi32RotateInsertByte3.Left, DWExtractHi32.SubReg, 8, 24, 31); +def DWBytes3012 { + dag Word = (RLWINM DWBytes0123.Word, 24, 0, 31); } -def ExtendHi32 { - dag To64Bit = - (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - DWHi32RotateInsertByte1.Left, sub_32)); +def DWBytes3212 { + dag Word = (RLWIMI DWBytes3012.Word, DWBytes0123.Word, 8, 8, 15); } -def DWShiftLo32 { // SLDI ExtendHi32.To64Bit, 32 - dag ToHi32 = (RLDICR ExtendHi32.To64Bit, 32, 31); +// B3 B2 B1 B0 in the right order +def DWBytes3210 { + dag Word = (RLWIMI DWBytes3212.Word, DWBytes0123.Word, 8, 24, 31); + dag DWord = + (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes3210.Word, sub_32)); } +// Now both high word and low word are reversed, next +// swap the high word and low word. def : Pat<(i64 (bitreverse i64:$A)), - (OR8 DWShiftLo32.ToHi32, ExtendLo32.To64Bit)>; + (OR8 (RLDICR DWBytes7654.DWord, 32, 31), DWBytes3210.DWord)>; diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 942e8b392b82b..6f719784eb7c6 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -47,6 +47,13 @@ def vssrc : RegisterOperand { let ParserMatchClass = PPCRegVSSRCAsmOperand; } +def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass { + let Name = "RegSPILLTOVSRRC"; let PredicateMethod = "isVSRegNumber"; +} + +def spilltovsrrc : RegisterOperand { + let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand; +} // Little-endian-specific nodes. 
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [ SDTCisVT<0, v2f64>, SDTCisPtrTy<1> @@ -124,6 +131,12 @@ let Uses = [RM] in { "lxsdx $XT, $src", IIC_LdStLFD, [(set f64:$XT, (load xoaddr:$src))]>; + // Pseudo instruction XFLOADf64 will be expanded to LXSDX or LFDX later + let isPseudo = 1, CodeSize = 3 in + def XFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrr:$src), + "#XFLOADf64", + [(set f64:$XT, (load xoaddr:$src))]>; + let Predicates = [HasVSX, HasOnlySwappingMemOps] in def LXVD2X : XX1Form<31, 844, (outs vsrc:$XT), (ins memrr:$src), @@ -149,6 +162,12 @@ let Uses = [RM] in { "stxsdx $XT, $dst", IIC_LdStSTFD, [(store f64:$XT, xoaddr:$dst)]>; + // Pseudo instruction XFSTOREf64 will be expanded to STXSDX or STFDX later + let isPseudo = 1, CodeSize = 3 in + def XFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrr:$dst), + "#XFSTOREf64", + [(store f64:$XT, xoaddr:$dst)]>; + let Predicates = [HasVSX, HasOnlySwappingMemOps] in { // The behaviour of this instruction is endianness-specific so we provide no // pattern to match it without considering endianness. @@ -1208,32 +1227,59 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. let mayLoad = 1, mayStore = 0 in { let CodeSize = 3 in def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src), - "lxsspx $XT, $src", IIC_LdStLFD, - [(set f32:$XT, (load xoaddr:$src))]>; + "lxsspx $XT, $src", IIC_LdStLFD, []>; def LXSIWAX : XX1Form<31, 76, (outs vsfrc:$XT), (ins memrr:$src), - "lxsiwax $XT, $src", IIC_LdStLFD, - [(set f64:$XT, (PPClfiwax xoaddr:$src))]>; + "lxsiwax $XT, $src", IIC_LdStLFD, []>; def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src), - "lxsiwzx $XT, $src", IIC_LdStLFD, - [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>; + "lxsiwzx $XT, $src", IIC_LdStLFD, []>; + + // Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it + // would cause these Pseudos are not expanded in expandPostRAPseudos() + let isPseudo = 1 in { + // Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later + let CodeSize = 3 in + def XFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrr:$src), + "#XFLOADf32", + [(set f32:$XT, (load xoaddr:$src))]>; + // Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later + def LIWAX : Pseudo<(outs vsfrc:$XT), (ins memrr:$src), + "#LIWAX", + [(set f64:$XT, (PPClfiwax xoaddr:$src))]>; + // Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later + def LIWZX : Pseudo<(outs vsfrc:$XT), (ins memrr:$src), + "#LIWZX", + [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>; + } } // mayLoad // VSX scalar stores introduced in ISA 2.07 let mayStore = 1, mayLoad = 0 in { let CodeSize = 3 in def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst), - "stxsspx $XT, $dst", IIC_LdStSTFD, - [(store f32:$XT, xoaddr:$dst)]>; + "stxsspx $XT, $dst", IIC_LdStSTFD, []>; def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst), - "stxsiwx $XT, $dst", IIC_LdStSTFD, - [(PPCstfiwx f64:$XT, xoaddr:$dst)]>; + "stxsiwx $XT, $dst", IIC_LdStSTFD, []>; + + // Please note let isPseudo = 1 is not part of class Pseudo<>. 
Missing it + // would cause these pseudo instructions not to be expanded in expandPostRAPseudos() + let isPseudo = 1 in { + // Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later + let CodeSize = 3 in + def XFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrr:$dst), + "#XFSTOREf32", + [(store f32:$XT, xoaddr:$dst)]>; + // Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later + def STIWX : Pseudo<(outs), (ins vsfrc:$XT, memrr:$dst), + "#STIWX", + [(PPCstfiwx f64:$XT, xoaddr:$dst)]>; + } } // mayStore } // UseVSXReg = 1 def : Pat<(f64 (extloadf32 xoaddr:$src)), - (COPY_TO_REGCLASS (LXSSPX xoaddr:$src), VSFRC)>; + (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>; def : Pat<(f32 (fpround (extloadf32 xoaddr:$src))), - (f32 (LXSSPX xoaddr:$src))>; + (f32 (XFLOADf32 xoaddr:$src))>; def : Pat<(f64 (fpextend f32:$src)), (COPY_TO_REGCLASS $src, VSFRC)>; @@ -1407,7 +1453,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; } def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)), - (v4i32 (XXSPLTWs (LXSIWAX xoaddr:$src), 1))>; + (v4i32 (XXSPLTWs (LIWAX xoaddr:$src), 1))>; } // AddedComplexity = 400 } // HasP8Vector @@ -1769,6 +1815,7 @@ def VectorExtractions { dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC); } +def NoP9Altivec : Predicate<"!PPCSubTarget->hasP9Altivec()">; let AddedComplexity = 400 in { // v4f32 scalar <-> vector conversions (BE) let Predicates = [IsBigEndian, HasP8Vector] in { @@ -1801,6 +1848,17 @@ let Predicates = [IsBigEndian, HasDirectMove] in { (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>; def : Pat<(v2i64 (scalar_to_vector i64:$A)), (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>; + + // v2i64 scalar <-> vector conversions (BE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.BE_VARIABLE_DWORD)>; +} // IsBigEndian, HasDirectMove + +let Predicates = [IsBigEndian, HasDirectMove, NoP9Altivec] in { def : Pat<(i32 (vector_extract v16i8:$S, 0)), (i32 VectorExtractions.LE_BYTE_15)>; def : Pat<(i32 (vector_extract v16i8:$S, 1)), @@ -1867,15 +1925,7 @@ let Predicates = [IsBigEndian, HasDirectMove] in { (i32 VectorExtractions.LE_WORD_0)>; def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), (i32 VectorExtractions.BE_VARIABLE_WORD)>; - - // v2i64 scalar <-> vector conversions (BE) - def : Pat<(i64 (vector_extract v2i64:$S, 0)), - (i64 VectorExtractions.LE_DWORD_1)>; - def : Pat<(i64 (vector_extract v2i64:$S, 1)), - (i64 VectorExtractions.LE_DWORD_0)>; - def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), - (i64 VectorExtractions.BE_VARIABLE_DWORD)>; -} // IsBigEndian, HasDirectMove +} // IsBigEndian, HasDirectMove, NoP9Altivec // v4f32 scalar <-> vector conversions (LE) let Predicates = [IsLittleEndian, HasP8Vector] in { @@ -1931,8 +1981,10 @@ let Predicates = [HasP9Altivec, IsLittleEndian] in { (VEXTUWRX (LI8 0), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))), (VEXTUWRX (LI8 4), $S)>; + // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))), - (VEXTUWRX (LI8 8), $S)>; + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (i32 VectorExtractions.LE_WORD_2), sub_32)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))), (VEXTUWRX (LI8 12), $S)>; @@
-1942,11 +1994,82 @@ let Predicates = [HasP9Altivec, IsLittleEndian] in { (EXTSW (VEXTUWRX (LI8 0), $S))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))), (EXTSW (VEXTUWRX (LI8 4), $S))>; + // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))), - (EXTSW (VEXTUWRX (LI8 8), $S))>; + (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (i32 VectorExtractions.LE_WORD_2), sub_32))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))), (EXTSW (VEXTUWRX (LI8 12), $S))>; + + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUBRX $Idx, $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 0), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 1), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 2), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 3), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 4), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 5), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 6), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 7), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 8), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 9), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 10), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 11), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 12), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 13)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 13), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 14), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 15), $S), sub_32))>; + + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUHRX + (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 0), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 2), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 4), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 6), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 8), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 10), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 12), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 14), $S), sub_32))>; + + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUWRX + (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>; + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 (EXTRACT_SUBREG
(VEXTUWRX (LI8 0), $S), sub_32))>; + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 4), $S), sub_32))>; + // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 12), $S), sub_32))>; } + let Predicates = [HasP9Altivec, IsBigEndian] in { def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))), (VEXTUBLX $Idx, $S)>; @@ -1974,8 +2097,11 @@ let Predicates = [HasP9Altivec, IsBigEndian] in { (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))), (VEXTUWLX (LI8 0), $S)>; + + // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))), - (VEXTUWLX (LI8 4), $S)>; + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (i32 VectorExtractions.LE_WORD_2), sub_32)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))), (VEXTUWLX (LI8 8), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))), @@ -1985,12 +2111,82 @@ let Predicates = [HasP9Altivec, IsBigEndian] in { (EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))), (EXTSW (VEXTUWLX (LI8 0), $S))>; + // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))), - (EXTSW (VEXTUWLX (LI8 4), $S))>; + (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (i32 VectorExtractions.LE_WORD_2), sub_32))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))), (EXTSW (VEXTUWLX (LI8 8), $S))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))), (EXTSW (VEXTUWLX (LI8 12), $S))>; + + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUBLX $Idx, $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 0), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 1), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 2), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 3), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 4), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 5), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 6), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 7), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 8), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 9), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 10), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 11), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 12), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 13)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 13), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 14), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 
(EXTRACT_SUBREG (VEXTUBLX (LI8 15), $S), sub_32))>; + + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUHLX + (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 0), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 2), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 4), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 6), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 8), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 10), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 12), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 14), $S), sub_32))>; + + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUWLX + (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>; + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 0), $S), sub_32))>; + // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 8), $S), sub_32))>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 12), $S), sub_32))>; } let Predicates = [IsLittleEndian, HasDirectMove] in { @@ -2003,6 +2199,16 @@ let Predicates = [IsLittleEndian, HasDirectMove] in { (v4i32 MovesToVSR.LE_WORD_0)>; def : Pat<(v2i64 (scalar_to_vector i64:$A)), (v2i64 MovesToVSR.LE_DWORD_0)>; + // v2i64 scalar <-> vector conversions (LE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.LE_VARIABLE_DWORD)>; +} // IsLittleEndian, HasDirectMove + +let Predicates = [IsLittleEndian, HasDirectMove, NoP9Altivec] in { def : Pat<(i32 (vector_extract v16i8:$S, 0)), (i32 VectorExtractions.LE_BYTE_0)>; def : Pat<(i32 (vector_extract v16i8:$S, 1)), @@ -2069,15 +2275,7 @@ let Predicates = [IsLittleEndian, HasDirectMove] in { (i32 VectorExtractions.LE_WORD_3)>; def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), (i32 VectorExtractions.LE_VARIABLE_WORD)>; - - // v2i64 scalar <-> vector conversions (LE) - def : Pat<(i64 (vector_extract v2i64:$S, 0)), - (i64 VectorExtractions.LE_DWORD_0)>; - def : Pat<(i64 (vector_extract v2i64:$S, 1)), - (i64 VectorExtractions.LE_DWORD_1)>; - def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), - (i64 VectorExtractions.LE_VARIABLE_DWORD)>; -} // IsLittleEndian, HasDirectMove +} // IsLittleEndian, HasDirectMove, NoP9Altivec let Predicates = [HasDirectMove, HasVSX] in { // bitconvert f32 -> i32 @@ -2344,7 +2542,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM), "xxinsertw $XT, $XB, $UIM", IIC_VecFP, - [(set v4i32:$XT, (PPCxxinsert v4i32:$XTi, v4i32:$XB, + [(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB, imm32SExt16:$UIM))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">; @@ -2550,6 +2748,51 @@ let AddedComplexity =
400, Predicates = [HasP9Vector] in { UseVSXReg; } // mayStore + let Predicates = [IsLittleEndian] in { + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>; + } + + let Predicates = [IsBigEndian] in { + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>; + } + + // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead + // of f64 + def : Pat<(v8i16 (PPCmtvsrz i32:$A)), + (v8i16 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>; + def : Pat<(v16i8 (PPCmtvsrz i32:$A)), + (v16i8 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>; + // Patterns for which instructions from ISA 3.0 are a better match let Predicates = [IsLittleEndian, HasP9Vector] in { def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))), @@ -2560,6 +2803,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>; def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>; def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>; def : Pat<(v4i32 (insertelt 
v4i32:$A, i32:$B, 1)), @@ -2587,6 +2838,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>; def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>; def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>; def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)), @@ -2809,6 +3068,23 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (f32 (DFLOADf32 ixaddr:$src))>; } // end HasP9Vector, AddedComplexity +let Predicates = [HasP9Vector] in { + let isPseudo = 1 in { + let mayStore = 1 in { + def SPILLTOVSR_STX : Pseudo<(outs), (ins spilltovsrrc:$XT, memrr:$dst), + "#SPILLTOVSR_STX", []>; + def SPILLTOVSR_ST : Pseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst), + "#SPILLTOVSR_ST", []>; + } + let mayLoad = 1 in { + def SPILLTOVSR_LDX : Pseudo<(outs spilltovsrrc:$XT), (ins memrr:$src), + "#SPILLTOVSR_LDX", []>; + def SPILLTOVSR_LD : Pseudo<(outs spilltovsrrc:$XT), (ins memrix:$src), + "#SPILLTOVSR_LD", []>; + + } + } +} // Integer extend helper dags 32 -> 64 def AnyExts { dag A = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32); @@ -2962,10 +3238,10 @@ let AddedComplexity = 400 in { (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>; def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>; + (XSCVDPSXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>; + (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)), (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>; @@ -2983,19 +3259,19 @@ let AddedComplexity = 400 in { } let Predicates = [HasVSX, NoP9Vector] in { - // Load-and-splat with fp-to-int conversion (using X-Form VSX loads). + // Load-and-splat with fp-to-int conversion (using X-Form VSX/FP loads). 
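// [Editor's note -- not part of the original patch] The patterns below now
// select the XFLOADf32/XFLOADf64 pseudos instead of LXSSPX/LXSDX directly.
// This defers the choice of the real load opcode until after register
// allocation; a plausible sketch of the post-RA expansion (the actual logic
// lives in PPCInstrInfo's pseudo expansion) is:
//   destination assigned to a VSX register -> lxsspx / lxsdx
//   destination assigned to a classic FPR  -> lfsx   / lfdx
// so one pattern serves both register banks.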
def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWS (LXSDX xoaddr:$A)), VSRC), 1))>; + (XSCVDPSXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWS (LXSDX xoaddr:$A)), VSRC), 1))>; + (XSCVDPUXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>; def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS - (LXSSPX xoaddr:$A), VSFRC)), 0))>; + (XFLOADf32 xoaddr:$A), VSFRC)), 0))>; def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS - (LXSSPX xoaddr:$A), VSFRC)), 0))>; + (XFLOADf32 xoaddr:$A), VSFRC)), 0))>; } // Big endian, available on all targets with VSX diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index a349fa1b40907..cdf544bdfac35 100644 --- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -61,6 +62,8 @@ static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars", cl::Hidden, cl::init(16), cl::desc("Potential PHI threshold for PPC preinc loop prep")); +STATISTIC(PHINodeAlreadyExists, "PHI node already in pre-increment form"); + namespace llvm { void initializePPCLoopPreIncPrepPass(PassRegistry&); @@ -88,6 +91,9 @@ namespace { AU.addRequired<ScalarEvolutionWrapperPass>(); } + bool alreadyPrepared(Loop *L, Instruction* MemI, + const SCEV *BasePtrStartSCEV, + const SCEVConstant *BasePtrIncSCEV); bool runOnFunction(Function &F) override; bool runOnLoop(Loop *L); @@ -177,6 +183,62 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) { return MadeChange; } +// In order to prepare for the pre-increment, a PHI is added. +// This function will check to see if that PHI already exists and will return +// true if it found an existing PHI with the same start and increment as the +// one we wanted to create.
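// [Editor's illustration -- not part of the original patch] The "prepared"
// shape being detected is, in C-like terms:
//   p = base - stride;               // incoming value of the PHI
//   loop: p += stride; use(*p);      // increment == BasePtrIncSCEV
// i.e. a pointer PHI whose start value is one stride before the first
// access (BasePtrStartSCEV), so that update-form loads/stores can be used.
// If such a PHI already exists, creating another would just add a redundant
// induction variable.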
+bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI, + const SCEV *BasePtrStartSCEV, + const SCEVConstant *BasePtrIncSCEV) { + BasicBlock *BB = MemI->getParent(); + if (!BB) + return false; + + BasicBlock *PredBB = L->getLoopPredecessor(); + BasicBlock *LatchBB = L->getLoopLatch(); + + if (!PredBB || !LatchBB) + return false; + + // Run through the PHIs and see if we have some that look like a preparation + iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis(); + for (auto & CurrentPHI : PHIIter) { + PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI); + if (!CurrentPHINode) + continue; + + if (!SE->isSCEVable(CurrentPHINode->getType())) + continue; + + const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L); + + const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV); + if (!PHIBasePtrSCEV) + continue; + + const SCEVConstant *PHIBasePtrIncSCEV = + dyn_cast<SCEVConstant>(PHIBasePtrSCEV->getStepRecurrence(*SE)); + if (!PHIBasePtrIncSCEV) + continue; + + if (CurrentPHINode->getNumIncomingValues() == 2) { + if ( (CurrentPHINode->getIncomingBlock(0) == LatchBB && + CurrentPHINode->getIncomingBlock(1) == PredBB) || + (CurrentPHINode->getIncomingBlock(1) == LatchBB && + CurrentPHINode->getIncomingBlock(0) == PredBB) ) { + if (PHIBasePtrSCEV->getStart() == BasePtrStartSCEV && + PHIBasePtrIncSCEV == BasePtrIncSCEV) { + // The existing PHI (CurrentPHINode) has the same start and increment + // as the PHI that we wanted to create. + ++PHINodeAlreadyExists; + return true; + } + } + } + } + return false; +} + bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { bool MadeChange = false; @@ -347,6 +409,9 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); + if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV)) + continue; + PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount, MemI->hasName() ?
MemI->getName() + ".phi" : "", Header->getFirstNonPHI()); diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index b310493587ae7..1e40711328ece 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -20,14 +20,14 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Mangler.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) { @@ -143,45 +143,48 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, OutMI.setOpcode(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - MCOperand MCOp; - switch (MO.getType()) { - default: - MI->print(errs()); - llvm_unreachable("unknown operand type"); - case MachineOperand::MO_Register: - assert(!MO.getSubReg() && "Subregs should be eliminated!"); - assert(MO.getReg() > PPC::NoRegister && - MO.getReg() < PPC::NUM_TARGET_REGS && - "Invalid register for this target!"); - MCOp = MCOperand::createReg(MO.getReg()); - break; - case MachineOperand::MO_Immediate: - MCOp = MCOperand::createImm(MO.getImm()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( - MO.getMBB()->getSymbol(), AP.OutContext)); - break; - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_ExternalSymbol: - MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin); - break; - case MachineOperand::MO_JumpTableIndex: - MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin); - break; - case MachineOperand::MO_ConstantPoolIndex: - MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin); - break; - case MachineOperand::MO_BlockAddress: - MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP, - isDarwin); - break; - case MachineOperand::MO_RegisterMask: - continue; - } - - OutMI.addOperand(MCOp); + if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP, + isDarwin)) + OutMI.addOperand(MCOp); + } +} + +bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, + MCOperand &OutMO, AsmPrinter &AP, + bool isDarwin) { + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + assert(MO.getReg() > PPC::NoRegister && + MO.getReg() < PPC::NUM_TARGET_REGS && + "Invalid register for this target!"); + OutMO = MCOperand::createReg(MO.getReg()); + return true; + case MachineOperand::MO_Immediate: + OutMO = MCOperand::createImm(MO.getImm()); + return true; + case MachineOperand::MO_MachineBasicBlock: + OutMO = MCOperand::createExpr( + MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), AP.OutContext)); + return true; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin); + return true; + case MachineOperand::MO_JumpTableIndex: + OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin); + return 
true; + case MachineOperand::MO_ConstantPoolIndex: + OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin); + return true; + case MachineOperand::MO_BlockAddress: + OutMO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP, + isDarwin); + return true; + case MachineOperand::MO_RegisterMask: + return false; } } diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp index ff5f17c7628f2..a2640727f8138 100644 --- a/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -23,18 +23,50 @@ #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" #include "PPCTargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" +#include "MCTargetDesc/PPCPredicates.h" using namespace llvm; #define DEBUG_TYPE "ppc-mi-peepholes" -namespace llvm { - void initializePPCMIPeepholePass(PassRegistry&); -} +STATISTIC(RemoveTOCSave, "Number of TOC saves removed"); +STATISTIC(MultiTOCSaves, + "Number of functions with multiple TOC saves that must be kept"); +STATISTIC(NumEliminatedSExt, "Number of eliminated sign-extensions"); +STATISTIC(NumEliminatedZExt, "Number of eliminated zero-extensions"); +STATISTIC(NumOptADDLIs, "Number of optimized ADD instruction fed by LI"); +STATISTIC(NumConvertedToImmediateForm, + "Number of instructions converted to their immediate form"); +STATISTIC(NumFunctionsEnteredInMIPeephole, + "Number of functions entered in PPC MI Peepholes"); +STATISTIC(NumFixedPointIterations, + "Number of fixed-point iterations converting reg-reg instructions " + "to reg-imm ones"); + +static cl::opt<bool> +FixedPointRegToImm("ppc-reg-to-imm-fixed-point", cl::Hidden, cl::init(true), + cl::desc("Iterate to a fixed point when attempting to " + "convert reg-reg instructions to reg-imm")); + +static cl::opt<bool> +ConvertRegReg("ppc-convert-rr-to-ri", cl::Hidden, cl::init(false), + cl::desc("Convert eligible reg+reg instructions to reg+imm")); + +static cl::opt<bool> + EnableSExtElimination("ppc-eliminate-signext", + cl::desc("enable elimination of sign-extensions"), + cl::init(false), cl::Hidden); + +static cl::opt<bool> + EnableZExtElimination("ppc-eliminate-zeroext", + cl::desc("enable elimination of zero-extensions"), + cl::init(false), cl::Hidden); namespace { @@ -50,20 +82,31 @@ struct PPCMIPeephole : public MachineFunctionPass { } private: + MachineDominatorTree *MDT; + // Initialize class variables. void initialize(MachineFunction &MFParm); // Perform peepholes. bool simplifyCode(void); - // Find the "true" register represented by SrcReg (following chains - // of copies and subreg_to_reg operations). - unsigned lookThruCopyLike(unsigned SrcReg); + // Perform peepholes. + bool eliminateRedundantCompare(void); + bool eliminateRedundantTOCSaves(std::map<MachineInstr *, bool> &TOCSaves); + void UpdateTOCSaves(std::map<MachineInstr *, bool> &TOCSaves, + MachineInstr *MI); public: + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + // Main entry point for this pass.
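// [Editor's note -- not part of the original patch] The cl::opt knobs above
// are internal developer options; assuming the usual cl::opt registration,
// they can be toggled from the llc command line, e.g.
//   llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-convert-rr-to-ri t.ll
// to exercise the reg+reg -> reg+imm conversion that defaults to off.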
bool runOnMachineFunction(MachineFunction &MF) override { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; initialize(MF); return simplifyCode(); @@ -74,15 +117,138 @@ public: void PPCMIPeephole::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); TII = MF->getSubtarget().getInstrInfo(); DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n"); DEBUG(MF->dump()); } +static MachineInstr *getVRegDefOrNull(MachineOperand *Op, + MachineRegisterInfo *MRI) { + assert(Op && "Invalid Operand!"); + if (!Op->isReg()) + return nullptr; + + unsigned Reg = Op->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return nullptr; + + return MRI->getVRegDef(Reg); +} + +// This function returns the number of known zero bits in the output of MI +// starting from the most significant bit. +static unsigned +getKnownLeadingZeroCount(MachineInstr *MI, const PPCInstrInfo *TII) { + unsigned Opcode = MI->getOpcode(); + if (Opcode == PPC::RLDICL || Opcode == PPC::RLDICLo || + Opcode == PPC::RLDCL || Opcode == PPC::RLDCLo) + return MI->getOperand(3).getImm(); + + if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDICo) && + MI->getOperand(3).getImm() <= 63 - MI->getOperand(2).getImm()) + return MI->getOperand(3).getImm(); + + if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo || + Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo || + Opcode == PPC::RLWINM8 || Opcode == PPC::RLWNM8) && + MI->getOperand(3).getImm() <= MI->getOperand(4).getImm()) + return 32 + MI->getOperand(3).getImm(); + + if (Opcode == PPC::ANDIo) { + uint16_t Imm = MI->getOperand(2).getImm(); + return 48 + countLeadingZeros(Imm); + } + + if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZWo || + Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZWo || + Opcode == PPC::CNTLZW8 || Opcode == PPC::CNTTZW8) + // The result ranges from 0 to 32. + return 58; + + if (Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZDo || + Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZDo) + // The result ranges from 0 to 64. + return 57; + + if (Opcode == PPC::LHZ || Opcode == PPC::LHZX || + Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 || + Opcode == PPC::LHZU || Opcode == PPC::LHZUX || + Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8) + return 48; + + if (Opcode == PPC::LBZ || Opcode == PPC::LBZX || + Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 || + Opcode == PPC::LBZU || Opcode == PPC::LBZUX || + Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8) + return 56; + + if (TII->isZeroExtended(*MI)) + return 32; + + return 0; +} + +// This function maintains a map of pairs (TOC save instruction, keep flag). +// Each time a new TOC save is encountered, it checks if any of the existing +// ones are dominated by the new one. If so, it marks the existing one as +// redundant by setting its entry in the map as false. It then adds the new +// instruction to the map with either true or false depending on whether any +// existing instructions dominated the new one. +void PPCMIPeephole::UpdateTOCSaves( + std::map<MachineInstr *, bool> &TOCSaves, MachineInstr *MI) { + assert(TII->isTOCSaveMI(*MI) && "Expecting a TOC save instruction here"); + bool Keep = true; + for (auto It = TOCSaves.begin(); It != TOCSaves.end(); It++ ) { + MachineInstr *CurrInst = It->first; + // If the new instruction dominates an existing one, mark the existing one + // as redundant. + if (It->second && MDT->dominates(MI, CurrInst)) + It->second = false; + // Check if the new instruction is redundant. + if (MDT->dominates(CurrInst, MI)) { + Keep = false; + break; + } + } + // Add new instruction to map.
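// [Editor's worked example -- not part of the original patch] With a TOC
// save S1 in the entry block and a second save S2 inside a loop body, S1
// dominates S2: when S2 is visited, the loop above finds the dominating S1,
// so S2 enters the map with Keep == false and is later erased by
// eliminateRedundantTOCSaves(), leaving only S1.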
+ TOCSaves[MI] = Keep; +} + // Perform peephole optimizations. bool PPCMIPeephole::simplifyCode(void) { bool Simplified = false; MachineInstr* ToErase = nullptr; + std::map<MachineInstr *, bool> TOCSaves; + + NumFunctionsEnteredInMIPeephole++; + if (ConvertRegReg) { + // Fixed-point conversion of reg/reg instructions fed by load-immediate + // into reg/imm instructions. FIXME: This is expensive, control it with + // an option. + bool SomethingChanged = false; + do { + NumFixedPointIterations++; + SomethingChanged = false; + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + if (MI.isDebugValue()) + continue; + + if (TII->convertToImmediateForm(MI)) { + // We don't erase anything in case the def has other uses. Let DCE + // remove it if it can be removed. + DEBUG(dbgs() << "Converted instruction to imm form: "); + DEBUG(MI.dump()); + NumConvertedToImmediateForm++; + SomethingChanged = true; + Simplified = true; + continue; + } + } + } + } while (SomethingChanged && FixedPointRegToImm); + } for (MachineBasicBlock &MBB : *MF) { for (MachineInstr &MI : MBB) { @@ -104,6 +270,18 @@ bool PPCMIPeephole::simplifyCode(void) { default: break; + case PPC::STD: { + MachineFrameInfo &MFI = MF->getFrameInfo(); + if (MFI.hasVarSizedObjects() || + !MF->getSubtarget<PPCSubtarget>().isELFv2ABI()) + break; + // When encountering a TOC save instruction, call UpdateTOCSaves + // to add it to the TOCSaves map and mark any existing TOC saves + // it dominates as redundant. + if (TII->isTOCSaveMI(MI)) + UpdateTOCSaves(TOCSaves, &MI); + break; + } case PPC::XXPERMDI: { // Perform simplifications of 2x64 vector swaps and splats. // A swap is identified by an immediate value of 2, and a splat @@ -118,8 +296,10 @@ bool PPCMIPeephole::simplifyCode(void) { // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed. // We have to look through chains of COPY and SUBREG_TO_REG // to find the real source values for comparison. - unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg()); - unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg()); + unsigned TrueReg1 = + TII->lookThruCopyLike(MI.getOperand(1).getReg(), MRI); + unsigned TrueReg2 = + TII->lookThruCopyLike(MI.getOperand(2).getReg(), MRI); if (TrueReg1 == TrueReg2 && TargetRegisterInfo::isVirtualRegister(TrueReg1)) { @@ -133,7 +313,8 @@ bool PPCMIPeephole::simplifyCode(void) { auto isConversionOfLoadAndSplat = [=]() -> bool { if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS) return false; - unsigned DefReg = lookThruCopyLike(DefMI->getOperand(1).getReg()); + unsigned DefReg = + TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); if (TargetRegisterInfo::isVirtualRegister(DefReg)) { MachineInstr *LoadMI = MRI->getVRegDef(DefReg); if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX) @@ -159,10 +340,10 @@ bool PPCMIPeephole::simplifyCode(void) { // can replace it with a copy. if (DefOpc == PPC::XXPERMDI) { unsigned FeedImmed = DefMI->getOperand(3).getImm(); - unsigned FeedReg1 - = lookThruCopyLike(DefMI->getOperand(1).getReg()); - unsigned FeedReg2 - = lookThruCopyLike(DefMI->getOperand(2).getReg()); + unsigned FeedReg1 = + TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); + unsigned FeedReg2 = + TII->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI); if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) { DEBUG(dbgs() @@ -220,7 +401,8 @@ bool PPCMIPeephole::simplifyCode(void) { case PPC::XXSPLTW: { unsigned MyOpcode = MI.getOpcode(); unsigned OpNo = MyOpcode == PPC::XXSPLTW ?
1 : 2; - unsigned TrueReg = lookThruCopyLike(MI.getOperand(OpNo).getReg()); + unsigned TrueReg = + TII->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI); if (!TargetRegisterInfo::isVirtualRegister(TrueReg)) break; MachineInstr *DefMI = MRI->getVRegDef(TrueReg); @@ -282,7 +464,8 @@ bool PPCMIPeephole::simplifyCode(void) { } case PPC::XVCVDPSP: { // If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant. - unsigned TrueReg = lookThruCopyLike(MI.getOperand(1).getReg()); + unsigned TrueReg = + TII->lookThruCopyLike(MI.getOperand(1).getReg(), MRI); if (!TargetRegisterInfo::isVirtualRegister(TrueReg)) break; MachineInstr *DefMI = MRI->getVRegDef(TrueReg); @@ -290,8 +473,10 @@ bool PPCMIPeephole::simplifyCode(void) { // This can occur when building a vector of single precision or integer // values. if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) { - unsigned DefsReg1 = lookThruCopyLike(DefMI->getOperand(1).getReg()); - unsigned DefsReg2 = lookThruCopyLike(DefMI->getOperand(2).getReg()); + unsigned DefsReg1 = + TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); + unsigned DefsReg2 = + TII->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI); if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) || !TargetRegisterInfo::isVirtualRegister(DefsReg2)) break; @@ -336,8 +521,248 @@ bool PPCMIPeephole::simplifyCode(void) { } break; } + case PPC::EXTSH: + case PPC::EXTSH8: + case PPC::EXTSH8_32_64: { + if (!EnableSExtElimination) break; + unsigned NarrowReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(NarrowReg)) + break; + + MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg); + // If we've used a zero-extending load that we will sign-extend, + // just do a sign-extending load. + if (SrcMI->getOpcode() == PPC::LHZ || + SrcMI->getOpcode() == PPC::LHZX) { + if (!MRI->hasOneNonDBGUse(SrcMI->getOperand(0).getReg())) + break; + auto is64Bit = [] (unsigned Opcode) { + return Opcode == PPC::EXTSH8; + }; + auto isXForm = [] (unsigned Opcode) { + return Opcode == PPC::LHZX; + }; + auto getSextLoadOp = [] (bool is64Bit, bool isXForm) { + if (is64Bit) + if (isXForm) return PPC::LHAX8; + else return PPC::LHA8; + else + if (isXForm) return PPC::LHAX; + else return PPC::LHA; + }; + unsigned Opc = getSextLoadOp(is64Bit(MI.getOpcode()), + isXForm(SrcMI->getOpcode())); + DEBUG(dbgs() << "Zero-extending load\n"); + DEBUG(SrcMI->dump()); + DEBUG(dbgs() << "and sign-extension\n"); + DEBUG(MI.dump()); + DEBUG(dbgs() << "are merged into sign-extending load\n"); + SrcMI->setDesc(TII->get(Opc)); + SrcMI->getOperand(0).setReg(MI.getOperand(0).getReg()); + ToErase = &MI; + Simplified = true; + NumEliminatedSExt++; + } + break; + } + case PPC::EXTSW: + case PPC::EXTSW_32: + case PPC::EXTSW_32_64: { + if (!EnableSExtElimination) break; + unsigned NarrowReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(NarrowReg)) + break; + + MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg); + // If we've used a zero-extending load that we will sign-extend, + // just do a sign-extending load. 
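// [Editor's illustration -- not part of the original patch] Example for the
// check below: if instruction selection has already committed to a lwz
// (say, because the load and its use sit in different blocks), a later
//   extsw rD, rS
// whose input is used only by that extension folds with the lwz into a
// single lwa, the sign-extending form of the same load.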
+ if (SrcMI->getOpcode() == PPC::LWZ || + SrcMI->getOpcode() == PPC::LWZX) { + if (!MRI->hasOneNonDBGUse(SrcMI->getOperand(0).getReg())) + break; + auto is64Bit = [] (unsigned Opcode) { + return Opcode == PPC::EXTSW || Opcode == PPC::EXTSW_32_64; + }; + auto isXForm = [] (unsigned Opcode) { + return Opcode == PPC::LWZX; + }; + auto getSextLoadOp = [] (bool is64Bit, bool isXForm) { + if (is64Bit) + if (isXForm) return PPC::LWAX; + else return PPC::LWA; + else + if (isXForm) return PPC::LWAX_32; + else return PPC::LWA_32; + }; + unsigned Opc = getSextLoadOp(is64Bit(MI.getOpcode()), + isXForm(SrcMI->getOpcode())); + DEBUG(dbgs() << "Zero-extending load\n"); + DEBUG(SrcMI->dump()); + DEBUG(dbgs() << "and sign-extension\n"); + DEBUG(MI.dump()); + DEBUG(dbgs() << "are merged into sign-extending load\n"); + SrcMI->setDesc(TII->get(Opc)); + SrcMI->getOperand(0).setReg(MI.getOperand(0).getReg()); + ToErase = &MI; + Simplified = true; + NumEliminatedSExt++; + } else if (MI.getOpcode() == PPC::EXTSW_32_64 && + TII->isSignExtended(*SrcMI)) { + // We can eliminate EXTSW if the input is known to be already + // sign-extended. + DEBUG(dbgs() << "Removing redundant sign-extension\n"); + unsigned TmpReg = + MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::IMPLICIT_DEF), + TmpReg); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::INSERT_SUBREG), + MI.getOperand(0).getReg()) + .addReg(TmpReg) + .addReg(NarrowReg) + .addImm(PPC::sub_32); + ToErase = &MI; + Simplified = true; + NumEliminatedSExt++; + } + break; + } + case PPC::RLDICL: { + // We can eliminate RLDICL (e.g. for zero-extension) + // if all bits to clear are already zero in the input. + // This code assumes the following code sequence for zero-extension. + // %6 = COPY %5:sub_32; (optional) + // %8 = IMPLICIT_DEF; + // %7 = INSERT_SUBREG %8, %6, sub_32; + if (!EnableZExtElimination) break; + + if (MI.getOperand(2).getImm() != 0) + break; + + unsigned SrcReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + break; + + MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); + if (!(SrcMI && SrcMI->getOpcode() == PPC::INSERT_SUBREG && + SrcMI->getOperand(0).isReg() && SrcMI->getOperand(1).isReg())) + break; + + MachineInstr *ImpDefMI, *SubRegMI; + ImpDefMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg()); + SubRegMI = MRI->getVRegDef(SrcMI->getOperand(2).getReg()); + if (ImpDefMI->getOpcode() != PPC::IMPLICIT_DEF) break; + + SrcMI = SubRegMI; + if (SubRegMI->getOpcode() == PPC::COPY) { + unsigned CopyReg = SubRegMI->getOperand(1).getReg(); + if (TargetRegisterInfo::isVirtualRegister(CopyReg)) + SrcMI = MRI->getVRegDef(CopyReg); + } + + unsigned KnownZeroCount = getKnownLeadingZeroCount(SrcMI, TII); + if (MI.getOperand(3).getImm() <= KnownZeroCount) { + DEBUG(dbgs() << "Removing redundant zero-extension\n"); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .addReg(SrcReg); + ToErase = &MI; + Simplified = true; + NumEliminatedZExt++; + } + break; + } + + // TODO: Any instruction that has an immediate form fed only by a PHI + // whose operands are all load immediate can be folded away. We currently + // do this for ADD instructions, but should expand it to arithmetic and + // binary instructions with immediate forms in the future.
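// [Editor's illustration -- not part of the original patch] Shape handled
// by the ADD case below:
//   %v = PHI [ (LI 5), %bb1 ], [ (LI 7), %bb2 ]
//   %r = ADD %v, %base
// Each LI is rewritten in place to "ADDI %base, imm" in its own
// predecessor, and the ADD collapses to a COPY of the PHI, saving one
// instruction on every path through the CFG.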
+ case PPC::ADD4: + case PPC::ADD8: { + auto isSingleUsePHI = [&](MachineOperand *PhiOp) { + assert(PhiOp && "Invalid Operand!"); + MachineInstr *DefPhiMI = getVRegDefOrNull(PhiOp, MRI); + + return DefPhiMI && (DefPhiMI->getOpcode() == PPC::PHI) && + MRI->hasOneNonDBGUse(DefPhiMI->getOperand(0).getReg()); + }; + + auto dominatesAllSingleUseLIs = [&](MachineOperand *DominatorOp, + MachineOperand *PhiOp) { + assert(PhiOp && "Invalid Operand!"); + assert(DominatorOp && "Invalid Operand!"); + MachineInstr *DefPhiMI = getVRegDefOrNull(PhiOp, MRI); + MachineInstr *DefDomMI = getVRegDefOrNull(DominatorOp, MRI); + + // Note: the vregs only show up at the odd operand indices of the PHI + // node; the even indices hold the basic block info. + for (unsigned i = 1; i < DefPhiMI->getNumOperands(); i += 2) { + MachineInstr *LiMI = + getVRegDefOrNull(&DefPhiMI->getOperand(i), MRI); + if (!LiMI || + (LiMI->getOpcode() != PPC::LI && LiMI->getOpcode() != PPC::LI8) + || !MRI->hasOneNonDBGUse(LiMI->getOperand(0).getReg()) || + !MDT->dominates(DefDomMI, LiMI)) + return false; + } + + return true; + }; + + MachineOperand Op1 = MI.getOperand(1); + MachineOperand Op2 = MI.getOperand(2); + if (isSingleUsePHI(&Op2) && dominatesAllSingleUseLIs(&Op1, &Op2)) + std::swap(Op1, Op2); + else if (!isSingleUsePHI(&Op1) || !dominatesAllSingleUseLIs(&Op2, &Op1)) + break; // We don't have an ADD fed by LIs that can be transformed + + // Now we know that Op1 is the PHI node and Op2 is the dominator + unsigned DominatorReg = Op2.getReg(); + + const TargetRegisterClass *TRC = MI.getOpcode() == PPC::ADD8 + ? &PPC::G8RC_and_G8RC_NOX0RegClass + : &PPC::GPRC_and_GPRC_NOR0RegClass; + MRI->setRegClass(DominatorReg, TRC); + + // Replace LIs with ADDIs + MachineInstr *DefPhiMI = getVRegDefOrNull(&Op1, MRI); + for (unsigned i = 1; i < DefPhiMI->getNumOperands(); i += 2) { + MachineInstr *LiMI = getVRegDefOrNull(&DefPhiMI->getOperand(i), MRI); + DEBUG(dbgs() << "Optimizing LI to ADDI: "); + DEBUG(LiMI->dump()); + + // There could be repeated registers in the PHI, e.g.: %1 = + // PHI %6, <%bb.2>, %8, <%bb.3>, %8, <%bb.6>; So if we've + // already replaced the def instruction, skip. + if (LiMI->getOpcode() == PPC::ADDI || LiMI->getOpcode() == PPC::ADDI8) + continue; + + assert((LiMI->getOpcode() == PPC::LI || + LiMI->getOpcode() == PPC::LI8) && + "Invalid Opcode!"); + auto LiImm = LiMI->getOperand(1).getImm(); // save the imm of LI + LiMI->RemoveOperand(1); // remove the imm of LI + LiMI->setDesc(TII->get(LiMI->getOpcode() == PPC::LI ? PPC::ADDI + : PPC::ADDI8)); + MachineInstrBuilder(*LiMI->getParent()->getParent(), *LiMI) + .addReg(DominatorReg) + .addImm(LiImm); // restore the imm of LI + DEBUG(LiMI->dump()); + } + + // Replace ADD with COPY + DEBUG(dbgs() << "Optimizing ADD to COPY: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .add(Op1); + ToErase = &MI; + Simplified = true; + NumOptADDLIs++; + break; + } } } + // If the last instruction was marked for elimination, // remove it now. if (ToErase) { @@ -346,37 +771,502 @@ bool PPCMIPeephole::simplifyCode(void) { } } + // Eliminate all the TOC save instructions which are redundant. + Simplified |= eliminateRedundantTOCSaves(TOCSaves); + // We try to eliminate redundant compare instructions. + Simplified |= eliminateRedundantCompare(); + return Simplified; } -// This is used to find the "true" source register for an -// XXPERMDI instruction, since MachineCSE does not handle the -// "copy-like" operations (Copy and SubregToReg).
Returns -// the original SrcReg unless it is the target of a copy-like -// operation, in which case we chain backwards through all -// such operations to the ultimate source register. If a -// physical register is encountered, we stop the search. -unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) { +// Helper functions for eliminateRedundantCompare +static bool isEqOrNe(MachineInstr *BI) { + PPC::Predicate Pred = (PPC::Predicate)BI->getOperand(0).getImm(); + unsigned PredCond = PPC::getPredicateCondition(Pred); + return (PredCond == PPC::PRED_EQ || PredCond == PPC::PRED_NE); +} + +static bool isSupportedCmpOp(unsigned opCode) { + return (opCode == PPC::CMPLD || opCode == PPC::CMPD || + opCode == PPC::CMPLW || opCode == PPC::CMPW || + opCode == PPC::CMPLDI || opCode == PPC::CMPDI || + opCode == PPC::CMPLWI || opCode == PPC::CMPWI); +} + +static bool is64bitCmpOp(unsigned opCode) { + return (opCode == PPC::CMPLD || opCode == PPC::CMPD || + opCode == PPC::CMPLDI || opCode == PPC::CMPDI); +} + +static bool isSignedCmpOp(unsigned opCode) { + return (opCode == PPC::CMPD || opCode == PPC::CMPW || + opCode == PPC::CMPDI || opCode == PPC::CMPWI); +} + +static unsigned getSignedCmpOpCode(unsigned opCode) { + if (opCode == PPC::CMPLD) return PPC::CMPD; + if (opCode == PPC::CMPLW) return PPC::CMPW; + if (opCode == PPC::CMPLDI) return PPC::CMPDI; + if (opCode == PPC::CMPLWI) return PPC::CMPWI; + return opCode; +} + +// We can decrement immediate x in (GE x) by changing it to (GT x-1) or +// (LT x) to (LE x-1) +static unsigned getPredicateToDecImm(MachineInstr *BI, MachineInstr *CMPI) { + uint64_t Imm = CMPI->getOperand(2).getImm(); + bool SignedCmp = isSignedCmpOp(CMPI->getOpcode()); + if ((!SignedCmp && Imm == 0) || (SignedCmp && Imm == 0x8000)) + return 0; + + PPC::Predicate Pred = (PPC::Predicate)BI->getOperand(0).getImm(); + unsigned PredCond = PPC::getPredicateCondition(Pred); + unsigned PredHint = PPC::getPredicateHint(Pred); + if (PredCond == PPC::PRED_GE) + return PPC::getPredicate(PPC::PRED_GT, PredHint); + if (PredCond == PPC::PRED_LT) + return PPC::getPredicate(PPC::PRED_LE, PredHint); + + return 0; +} + +// We can increment immediate x in (GT x) by changing it to (GE x+1) or +// (LE x) to (LT x+1) +static unsigned getPredicateToIncImm(MachineInstr *BI, MachineInstr *CMPI) { + uint64_t Imm = CMPI->getOperand(2).getImm(); + bool SignedCmp = isSignedCmpOp(CMPI->getOpcode()); + if ((!SignedCmp && Imm == 0xFFFF) || (SignedCmp && Imm == 0x7FFF)) + return 0; + + PPC::Predicate Pred = (PPC::Predicate)BI->getOperand(0).getImm(); + unsigned PredCond = PPC::getPredicateCondition(Pred); + unsigned PredHint = PPC::getPredicateHint(Pred); + if (PredCond == PPC::PRED_GT) + return PPC::getPredicate(PPC::PRED_GE, PredHint); + if (PredCond == PPC::PRED_LE) + return PPC::getPredicate(PPC::PRED_LT, PredHint); + + return 0; +} + +// This takes a Phi node and returns a register value for the specified BB. +static unsigned getIncomingRegForBlock(MachineInstr *Phi, + MachineBasicBlock *MBB) { + for (unsigned I = 2, E = Phi->getNumOperands() + 1; I != E; I += 2) { + MachineOperand &MO = Phi->getOperand(I); + if (MO.getMBB() == MBB) + return Phi->getOperand(I-1).getReg(); + } + llvm_unreachable("invalid src basic block for this Phi node\n"); + return 0; +} + +// This function tracks the source of the register through register copies. +// If BB1 and BB2 are non-NULL, we also track the PHI instruction in BB2 +// assuming that the control comes from BB1 into BB2.
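// [Editor's worked example -- not part of the original patch] Given
//   %a = ...                            (defined in or above BB1)
//   %p = PHI [ %a, BB1 ], [ %b, BB3 ]   (in BB2)
//   %c = COPY %p
// getSrcVReg(%c, BB1, BB2, MRI) follows the copy to %p, takes the BB1
// incoming value once (yielding %a, then any copies behind it), and stops.
// With BB1/BB2 null, the walk does not look through the PHI at all.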
+static unsigned getSrcVReg(unsigned Reg, MachineBasicBlock *BB1, + MachineBasicBlock *BB2, MachineRegisterInfo *MRI) { + unsigned SrcReg = Reg; + while (1) { + unsigned NextReg = SrcReg; + MachineInstr *Inst = MRI->getVRegDef(SrcReg); + if (BB1 && Inst->getOpcode() == PPC::PHI && Inst->getParent() == BB2) { + NextReg = getIncomingRegForBlock(Inst, BB1); + // We track through PHI only once to avoid an infinite loop. + BB1 = nullptr; + } + else if (Inst->isFullCopy()) + NextReg = Inst->getOperand(1).getReg(); + if (NextReg == SrcReg || !TargetRegisterInfo::isVirtualRegister(NextReg)) + break; + SrcReg = NextReg; + } + return SrcReg; +} + +static bool eligibleForCompareElimination(MachineBasicBlock &MBB, + MachineBasicBlock *&PredMBB, + MachineBasicBlock *&MBBtoMoveCmp, + MachineRegisterInfo *MRI) { + + auto isEligibleBB = [&](MachineBasicBlock &BB) { + auto BII = BB.getFirstInstrTerminator(); + // We optimize BBs ending with a conditional branch. + // We check only for BCC here, not BCCLR, because BCCLR + // will be formed only later in the pipeline. + if (BB.succ_size() == 2 && + BII != BB.instr_end() && + (*BII).getOpcode() == PPC::BCC && + (*BII).getOperand(1).isReg()) { + // We optimize only if the condition code is used only by one BCC. + unsigned CndReg = (*BII).getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(CndReg) || + !MRI->hasOneNonDBGUse(CndReg)) + return false; + + MachineInstr *CMPI = MRI->getVRegDef(CndReg); + // We assume compare and branch are in the same BB for ease of analysis. + if (CMPI->getParent() != &BB) + return false; + + // We skip this BB if a physical register is used in comparison. + for (MachineOperand &MO : CMPI->operands()) + if (MO.isReg() && !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return false; + + return true; + } + return false; + }; + + // If this BB has more than one successor, we can create a new BB and + // move the compare instruction into the new BB. + // So far, we do not move the compare instruction to a BB having multiple + // successors to avoid potentially increasing code size. + auto isEligibleForMoveCmp = [](MachineBasicBlock &BB) { + return BB.succ_size() == 1; + }; + + if (!isEligibleBB(MBB)) + return false; + + unsigned NumPredBBs = MBB.pred_size(); + if (NumPredBBs == 1) { + MachineBasicBlock *TmpMBB = *MBB.pred_begin(); + if (isEligibleBB(*TmpMBB)) { + PredMBB = TmpMBB; + MBBtoMoveCmp = nullptr; + return true; + } + } + else if (NumPredBBs == 2) { + // We check for the partially redundant case. + // So far, we support cases with only two predecessors + // to avoid increasing the number of instructions. + MachineBasicBlock::pred_iterator PI = MBB.pred_begin(); + MachineBasicBlock *Pred1MBB = *PI; + MachineBasicBlock *Pred2MBB = *(PI+1); + + if (isEligibleBB(*Pred1MBB) && isEligibleForMoveCmp(*Pred2MBB)) { + // We assume Pred1MBB is the BB containing the compare to be merged and + // Pred2MBB is the BB to which we will append a compare instruction. + // Hence we can proceed as is. + } + else if (isEligibleBB(*Pred2MBB) && isEligibleForMoveCmp(*Pred1MBB)) { + // We need to swap Pred1MBB and Pred2MBB to canonicalize. + std::swap(Pred1MBB, Pred2MBB); + } + else return false; + + // Here, Pred2MBB is the BB to which we need to append a compare inst. + // We cannot move the compare instruction if operands are not available + // in Pred2MBB (i.e. defined in MBB by an instruction other than PHI).
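// [Editor's illustration -- not part of the original patch] For the check
// below: if MBB computes "%x = ADD ..." and then compares %x, the compare
// cannot be appended to Pred2MBB, since %x does not exist there.  Operands
// defined outside MBB, or merged into MBB by a PHI (whose incoming value
// for the Pred2MBB edge can be used instead), are safe.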
+ MachineInstr *BI = &*MBB.getFirstInstrTerminator(); + MachineInstr *CMPI = MRI->getVRegDef(BI->getOperand(1).getReg()); + for (int I = 1; I <= 2; I++) + if (CMPI->getOperand(I).isReg()) { + MachineInstr *Inst = MRI->getVRegDef(CMPI->getOperand(I).getReg()); + if (Inst->getParent() == &MBB && Inst->getOpcode() != PPC::PHI) + return false; + } + + PredMBB = Pred1MBB; + MBBtoMoveCmp = Pred2MBB; + return true; + } + + return false; +} + +// This function will iterate over the input map containing pairs of a TOC save +// instruction and a flag. The flag will be set to false if the TOC save is proven +// redundant. This function will erase from the basic block all the TOC saves +// marked as redundant. +bool PPCMIPeephole::eliminateRedundantTOCSaves( + std::map<MachineInstr *, bool> &TOCSaves) { + bool Simplified = false; + int NumKept = 0; + for (auto TOCSave : TOCSaves) { + if (!TOCSave.second) { + TOCSave.first->eraseFromParent(); + RemoveTOCSave++; + Simplified = true; + } else { + NumKept++; + } + } - while (true) { + if (NumKept > 1) + MultiTOCSaves++; - MachineInstr *MI = MRI->getVRegDef(SrcReg); - if (!MI->isCopyLike()) - return SrcReg; + return Simplified; +} - unsigned CopySrcReg; - if (MI->isCopy()) - CopySrcReg = MI->getOperand(1).getReg(); +// If multiple conditional branches are executed based on the (essentially) +// same comparison, we merge compare instructions into one and make multiple +// conditional branches on this comparison. +// For example, +// if (a == 0) { ... } +// else if (a < 0) { ... } +// can be executed by one compare and two conditional branches instead of +// two pairs of a compare and a conditional branch. +// +// This method merges two compare instructions in two MBBs and modifies the +// compare and conditional branch instructions if needed. +// For the above example, the input for this pass looks like: +// cmplwi r3, 0 +// beq 0, .LBB0_3 +// cmpwi r3, -1 +// bgt 0, .LBB0_4 +// So, before merging two compares, we need to modify these instructions as +// cmpwi r3, 0 ; cmplwi and cmpwi yield the same result for beq +// beq 0, .LBB0_3 +// cmpwi r3, 0 ; greater than -1 means greater than or equal to 0 +// bge 0, .LBB0_4 + +bool PPCMIPeephole::eliminateRedundantCompare(void) { + // FIXME: this transformation is causing miscompiles. Disabling it for now + // until we can resolve the issue. + return false; + bool Simplified = false; + + for (MachineBasicBlock &MBB2 : *MF) { + MachineBasicBlock *MBB1 = nullptr, *MBBtoMoveCmp = nullptr; + + // For the fully redundant case, we select two basic blocks MBB1 and MBB2 + // as an optimization target if + // - both MBBs end with a conditional branch, + // - MBB1 is the only predecessor of MBB2, and + // - the compare does not take a physical register as an operand in both MBBs. + // In this case, eligibleForCompareElimination sets MBBtoMoveCmp nullptr. + // + // As a partially redundant case, we additionally handle the case in which + // MBB2 has one additional predecessor, which has only one successor (MBB2). + // In this case, we move the compare instruction originally in MBB2 into + // MBBtoMoveCmp. This partially redundant case typically appears when + // compiling a while loop; here, MBBtoMoveCmp is the loop preheader.
+    //
+    // Overview of CFG of related basic blocks
+    //
+    // Fully redundant case          Partially redundant case
+    //   --------                    ----------------  --------
+    //   | MBB1 | (w/ 2 succ)        | MBBtoMoveCmp |  | MBB1 | (w/ 2 succ)
+    //   --------                    ----------------  --------
+    //      |    \                    (w/ 1 succ) \       |    \
+    //      |     \                                \      |     \
+    //      |                                       \     |
+    //   --------                                   --------
+    //   | MBB2 | (w/ 1 pred                        | MBB2 | (w/ 2 pred
+    //   --------  and 2 succ)                      --------  and 2 succ)
+    //      |    \                                     |    \
+    //      |     \                                    |     \
+    //
+    if (!eligibleForCompareElimination(MBB2, MBB1, MBBtoMoveCmp, MRI))
+      continue;
+
+    MachineInstr *BI1   = &*MBB1->getFirstInstrTerminator();
+    MachineInstr *CMPI1 = MRI->getVRegDef(BI1->getOperand(1).getReg());
+
+    MachineInstr *BI2   = &*MBB2.getFirstInstrTerminator();
+    MachineInstr *CMPI2 = MRI->getVRegDef(BI2->getOperand(1).getReg());
+    bool IsPartiallyRedundant = (MBBtoMoveCmp != nullptr);
+
+    // We cannot optimize an unsupported compare opcode or
+    // a mix of 32-bit and 64-bit comparisons.
+    if (!isSupportedCmpOp(CMPI1->getOpcode()) ||
+        !isSupportedCmpOp(CMPI2->getOpcode()) ||
+        is64bitCmpOp(CMPI1->getOpcode()) != is64bitCmpOp(CMPI2->getOpcode()))
+      continue;
+
+    unsigned NewOpCode = 0;
+    unsigned NewPredicate1 = 0, NewPredicate2 = 0;
+    int16_t Imm1 = 0, NewImm1 = 0, Imm2 = 0, NewImm2 = 0;
+    bool SwapOperands = false;
+
+    if (CMPI1->getOpcode() != CMPI2->getOpcode()) {
+      // Typically, unsigned comparison is used for equality check, but
+      // we replace it with a signed comparison if the comparison
+      // to be merged is a signed comparison.
+      // In other cases of opcode mismatch, we cannot optimize this.
+      if (isEqOrNe(BI2) &&
+          CMPI1->getOpcode() == getSignedCmpOpCode(CMPI2->getOpcode()))
+        NewOpCode = CMPI1->getOpcode();
+      else if (isEqOrNe(BI1) &&
+               getSignedCmpOpCode(CMPI1->getOpcode()) == CMPI2->getOpcode())
+        NewOpCode = CMPI2->getOpcode();
+      else continue;
+    }
+
+    if (CMPI1->getOperand(2).isReg() && CMPI2->getOperand(2).isReg()) {
+      // In case of comparisons between two registers, these two registers
+      // must be the same to merge two comparisons.
+      unsigned Cmp1Operand1 = getSrcVReg(CMPI1->getOperand(1).getReg(),
+                                         nullptr, nullptr, MRI);
+      unsigned Cmp1Operand2 = getSrcVReg(CMPI1->getOperand(2).getReg(),
+                                         nullptr, nullptr, MRI);
+      unsigned Cmp2Operand1 = getSrcVReg(CMPI2->getOperand(1).getReg(),
+                                         MBB1, &MBB2, MRI);
+      unsigned Cmp2Operand2 = getSrcVReg(CMPI2->getOperand(2).getReg(),
+                                         MBB1, &MBB2, MRI);
+
+      if (Cmp1Operand1 == Cmp2Operand1 && Cmp1Operand2 == Cmp2Operand2) {
+        // Same pair of registers in the same order; ready to merge as is.
+      }
+      else if (Cmp1Operand1 == Cmp2Operand2 && Cmp1Operand2 == Cmp2Operand1) {
+        // Same pair of registers in different order.
+        // We reverse the predicate to merge compare instructions.
+        PPC::Predicate Pred = (PPC::Predicate)BI2->getOperand(0).getImm();
+        NewPredicate2 = (unsigned)PPC::getSwappedPredicate(Pred);
+        // In case of partial redundancy, we need to swap operands
+        // in the other compare instruction.
+        SwapOperands = true;
+      }
+      else continue;
+    }
+    else if (CMPI1->getOperand(2).isImm() && CMPI2->getOperand(2).isImm()) {
+      // In case of comparisons between a register and an immediate,
+      // the operand register must be the same for the two compare
+      // instructions.
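+      // E.g. (hypothetical operands): cmpwi %5, 0 in MBB1 and cmpwi %5, -1
+      // in MBB2 test the same register against different immediates; the
+      // code below tries to reconcile such immediates by adjusting the
+      // branch predicates.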
+      unsigned Cmp1Operand1 = getSrcVReg(CMPI1->getOperand(1).getReg(),
+                                         nullptr, nullptr, MRI);
+      unsigned Cmp2Operand1 = getSrcVReg(CMPI2->getOperand(1).getReg(),
+                                         MBB1, &MBB2, MRI);
+      if (Cmp1Operand1 != Cmp2Operand1)
+        continue;
+
+      NewImm1 = Imm1 = (int16_t)CMPI1->getOperand(2).getImm();
+      NewImm2 = Imm2 = (int16_t)CMPI2->getOperand(2).getImm();
+
+      // If the immediates are not the same, we try to adjust by changing the
+      // predicate; e.g. GT imm means GE (imm+1).
+      if (Imm1 != Imm2 && (!isEqOrNe(BI2) || !isEqOrNe(BI1))) {
+        int Diff = Imm1 - Imm2;
+        if (Diff < -2 || Diff > 2)
+          continue;
+
+        unsigned PredToInc1 = getPredicateToIncImm(BI1, CMPI1);
+        unsigned PredToDec1 = getPredicateToDecImm(BI1, CMPI1);
+        unsigned PredToInc2 = getPredicateToIncImm(BI2, CMPI2);
+        unsigned PredToDec2 = getPredicateToDecImm(BI2, CMPI2);
+        if (Diff == 2) {
+          if (PredToInc2 && PredToDec1) {
+            NewPredicate2 = PredToInc2;
+            NewPredicate1 = PredToDec1;
+            NewImm2++;
+            NewImm1--;
+          }
+        }
+        else if (Diff == 1) {
+          if (PredToInc2) {
+            NewImm2++;
+            NewPredicate2 = PredToInc2;
+          }
+          else if (PredToDec1) {
+            NewImm1--;
+            NewPredicate1 = PredToDec1;
+          }
+        }
+        else if (Diff == -1) {
+          if (PredToDec2) {
+            NewImm2--;
+            NewPredicate2 = PredToDec2;
+          }
+          else if (PredToInc1) {
+            NewImm1++;
+            NewPredicate1 = PredToInc1;
+          }
+        }
+        else if (Diff == -2) {
+          if (PredToDec2 && PredToInc1) {
+            NewPredicate2 = PredToDec2;
+            NewPredicate1 = PredToInc1;
+            NewImm2--;
+            NewImm1++;
+          }
+        }
+      }
+
+      // We cannot merge two compares if the immediates are not the same.
+      if (NewImm2 != NewImm1)
+        continue;
+    }
+
+    DEBUG(dbgs() << "Optimize two pairs of compare and branch:\n");
+    DEBUG(CMPI1->dump());
+    DEBUG(BI1->dump());
+    DEBUG(CMPI2->dump());
+    DEBUG(BI2->dump());
+
+    // We adjust opcode, predicates and immediate as we determined above.
+    if (NewOpCode != 0 && NewOpCode != CMPI1->getOpcode()) {
+      CMPI1->setDesc(TII->get(NewOpCode));
+    }
+    if (NewPredicate1) {
+      BI1->getOperand(0).setImm(NewPredicate1);
+    }
+    if (NewPredicate2) {
+      BI2->getOperand(0).setImm(NewPredicate2);
+    }
+    if (NewImm1 != Imm1) {
+      CMPI1->getOperand(2).setImm(NewImm1);
+    }
+
+    if (IsPartiallyRedundant) {
+      // We touch up the compare instruction in MBB2 and move it to
+      // a previous BB to handle the partially redundant case.
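+      // For illustration (hypothetical MIR): if the compare in MBB2 reads a
+      // PHI such as
+      //   %7 = PHI %5, <MBB1>, %6, <MBBtoMoveCmp>
+      // the copy of the compare appended to MBBtoMoveCmp must read %6, the
+      // value incoming from that block, rather than %7.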
+      if (SwapOperands) {
+        unsigned Op1 = CMPI2->getOperand(1).getReg();
+        unsigned Op2 = CMPI2->getOperand(2).getReg();
+        CMPI2->getOperand(1).setReg(Op2);
+        CMPI2->getOperand(2).setReg(Op1);
+      }
+      if (NewImm2 != Imm2)
+        CMPI2->getOperand(2).setImm(NewImm2);
+
+      for (int I = 1; I <= 2; I++) {
+        if (CMPI2->getOperand(I).isReg()) {
+          MachineInstr *Inst = MRI->getVRegDef(CMPI2->getOperand(I).getReg());
+          if (Inst->getParent() != &MBB2)
+            continue;
+
+          assert(Inst->getOpcode() == PPC::PHI &&
+                 "We cannot support if an operand comes from this BB.");
+          unsigned SrcReg = getIncomingRegForBlock(Inst, MBBtoMoveCmp);
+          CMPI2->getOperand(I).setReg(SrcReg);
+        }
+      }
+      auto I = MachineBasicBlock::iterator(MBBtoMoveCmp->getFirstTerminator());
+      MBBtoMoveCmp->splice(I, &MBB2, MachineBasicBlock::iterator(CMPI2));
+
+      DebugLoc DL = CMPI2->getDebugLoc();
+      unsigned NewVReg = MRI->createVirtualRegister(&PPC::CRRCRegClass);
+      BuildMI(MBB2, MBB2.begin(), DL,
+              TII->get(PPC::PHI), NewVReg)
+        .addReg(BI1->getOperand(1).getReg()).addMBB(MBB1)
+        .addReg(BI2->getOperand(1).getReg()).addMBB(MBBtoMoveCmp);
+      BI2->getOperand(1).setReg(NewVReg);
+    } else {
-      assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
-      CopySrcReg = MI->getOperand(2).getReg();
+      // We finally eliminate the compare instruction in MBB2.
+      BI2->getOperand(1).setReg(BI1->getOperand(1).getReg());
+      CMPI2->eraseFromParent();
     }
+    BI2->getOperand(1).setIsKill(true);
+    BI1->getOperand(1).setIsKill(false);
-    if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
-      return CopySrcReg;
+    DEBUG(dbgs() << "into a compare and two branches:\n");
+    DEBUG(CMPI1->dump());
+    DEBUG(BI1->dump());
+    DEBUG(BI2->dump());
+    if (IsPartiallyRedundant) {
+      DEBUG(dbgs() << "The following compare is moved into "
+                   << printMBBReference(*MBBtoMoveCmp)
+                   << " to handle partial redundancy.\n");
+      DEBUG(CMPI2->dump());
+    }
-    SrcReg = CopySrcReg;
+    Simplified = true;
   }
+
+  return Simplified;
+}
 
 } // end anonymous namespace
 
diff --git a/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h b/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h
new file mode 100644
index 0000000000000..628ea2ab9fe62
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h
@@ -0,0 +1,198 @@
+//==-- PPCMachineBasicBlockUtils.h - Functions for common MBB operations ---==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines utility functions for commonly used operations on
+// MachineBasicBlocks.
+// NOTE: Include this file after defining DEBUG_TYPE so that the debug messages
+// can be emitted for the pass that is using this.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_PPC_MACHINE_BASIC_BLOCK_UTILS_H
+#define LLVM_LIB_TARGET_PPC_MACHINE_BASIC_BLOCK_UTILS_H
+
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#ifndef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-generic-mbb-utilities"
+#endif
+
+using namespace llvm;
+
+/// Given a basic block \p Successor that potentially contains PHIs, this
+/// function will look for any incoming values in the PHIs that are supposed to
+/// be coming from \p OrigMBB but whose definition is actually in \p NewMBB.
+/// Any such PHIs will be updated to reflect reality.
+static void updatePHIs(MachineBasicBlock *Successor, MachineBasicBlock *OrigMBB,
+                       MachineBasicBlock *NewMBB, MachineRegisterInfo *MRI) {
+  for (auto &MI : Successor->instrs()) {
+    if (!MI.isPHI())
+      continue;
+    // This is a really ugly-looking loop, but it was pillaged directly from
+    // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
+    for (unsigned i = 2, e = MI.getNumOperands()+1; i != e; i += 2) {
+      MachineOperand &MO = MI.getOperand(i);
+      if (MO.getMBB() == OrigMBB) {
+        // Check if the instruction is actually defined in NewMBB.
+        if (MI.getOperand(i-1).isReg()) {
+          MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(i-1).getReg());
+          if (DefMI->getParent() == NewMBB ||
+              !OrigMBB->isSuccessor(Successor)) {
+            MO.setMBB(NewMBB);
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+
+/// Given a basic block \p Successor that potentially contains PHIs, this
+/// function will look for PHIs that have an incoming value from \p OrigMBB
+/// and will add the same incoming value from \p NewMBB.
+/// NOTE: This should only be used if \p NewMBB is an immediate dominator of
+/// \p OrigMBB.
+static void addIncomingValuesToPHIs(MachineBasicBlock *Successor,
+                                    MachineBasicBlock *OrigMBB,
+                                    MachineBasicBlock *NewMBB,
+                                    MachineRegisterInfo *MRI) {
+  assert(OrigMBB->isSuccessor(NewMBB) &&
+         "NewMBB must be a successor of OrigMBB");
+  for (auto &MI : Successor->instrs()) {
+    if (!MI.isPHI())
+      continue;
+    // This is a really ugly-looking loop, but it was pillaged directly from
+    // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
+    for (unsigned i = 2, e = MI.getNumOperands()+1; i != e; i += 2) {
+      MachineOperand &MO = MI.getOperand(i);
+      if (MO.getMBB() == OrigMBB) {
+        MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
+        MIB.addReg(MI.getOperand(i-1).getReg()).addMBB(NewMBB);
+        break;
+      }
+    }
+  }
+}
+
+struct BlockSplitInfo {
+  MachineInstr *OrigBranch;
+  MachineInstr *SplitBefore;
+  MachineInstr *SplitCond;
+  bool InvertNewBranch;
+  bool InvertOrigBranch;
+  bool BranchToFallThrough;
+  const MachineBranchProbabilityInfo *MBPI;
+  MachineInstr *MIToDelete;
+  MachineInstr *NewCond;
+  bool allInstrsInSameMBB() {
+    if (!OrigBranch || !SplitBefore || !SplitCond)
+      return false;
+    MachineBasicBlock *MBB = OrigBranch->getParent();
+    if (SplitBefore->getParent() != MBB ||
+        SplitCond->getParent() != MBB)
+      return false;
+    if (MIToDelete && MIToDelete->getParent() != MBB)
+      return false;
+    if (NewCond && NewCond->getParent() != MBB)
+      return false;
+    return true;
+  }
+};
+
+/// Splits a MachineBasicBlock to branch before \p SplitBefore. The original
+/// branch is \p OrigBranch. The target of the new branch can either be the
+/// same as the target of the original branch or the fallthrough successor of
+/// the original block as determined by \p BranchToFallThrough. The branch
+/// conditions will be inverted according to \p InvertNewBranch and
+/// \p InvertOrigBranch. If an instruction that previously fed the branch is to
+/// be deleted, it is provided in \p MIToDelete and \p NewCond will be used as
+/// the branch condition. The branch probabilities will be set if the
+/// MachineBranchProbabilityInfo isn't null.
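+/// A hypothetical invocation (field values purely illustrative) fills in a
+/// BlockSplitInfo and calls splitMBB:
+///   BlockSplitInfo BSI{Branch, SplitBefore, SplitCond,
+///                      /*InvertNewBranch=*/true, /*InvertOrigBranch=*/false,
+///                      /*BranchToFallThrough=*/true, MBPI,
+///                      /*MIToDelete=*/nullptr, /*NewCond=*/nullptr};
+///   splitMBB(BSI);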
+static bool splitMBB(BlockSplitInfo &BSI) {
+  assert(BSI.allInstrsInSameMBB() &&
+         "All instructions must be in the same block.");
+
+  MachineBasicBlock *ThisMBB = BSI.OrigBranch->getParent();
+  MachineFunction *MF = ThisMBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  assert(MRI->isSSA() && "Can only do this while the function is in SSA form.");
+  if (ThisMBB->succ_size() != 2) {
+    DEBUG(dbgs() << "Don't know how to handle blocks that don't have exactly"
+                 << " two successors.\n");
+    return false;
+  }
+
+  const PPCInstrInfo *TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+  unsigned OrigBROpcode = BSI.OrigBranch->getOpcode();
+  unsigned InvertedOpcode =
+    OrigBROpcode == PPC::BC ? PPC::BCn :
+    OrigBROpcode == PPC::BCn ? PPC::BC :
+    OrigBROpcode == PPC::BCLR ? PPC::BCLRn : PPC::BCLR;
+  unsigned NewBROpcode = BSI.InvertNewBranch ? InvertedOpcode : OrigBROpcode;
+  MachineBasicBlock *OrigTarget = BSI.OrigBranch->getOperand(1).getMBB();
+  MachineBasicBlock *OrigFallThrough =
+    OrigTarget == *ThisMBB->succ_begin() ? *ThisMBB->succ_rbegin() :
+    *ThisMBB->succ_begin();
+  MachineBasicBlock *NewBRTarget =
+    BSI.BranchToFallThrough ? OrigFallThrough : OrigTarget;
+  BranchProbability ProbToNewTarget =
+    !BSI.MBPI ? BranchProbability::getUnknown() :
+    BSI.MBPI->getEdgeProbability(ThisMBB, NewBRTarget);
+
+  // Create a new basic block.
+  MachineBasicBlock::iterator InsertPoint = BSI.SplitBefore;
+  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
+  MachineFunction::iterator It = ThisMBB->getIterator();
+  MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(++It, NewMBB);
+
+  // Move everything after SplitBefore into the new block.
+  NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end());
+  NewMBB->transferSuccessors(ThisMBB);
+
+  // Add the two successors to ThisMBB. The probabilities come from the
+  // existing blocks if available.
+  ThisMBB->addSuccessor(NewBRTarget, ProbToNewTarget);
+  ThisMBB->addSuccessor(NewMBB, ProbToNewTarget.getCompl());
+
+  // Add the branches to ThisMBB.
+  BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
+          TII->get(NewBROpcode)).addReg(BSI.SplitCond->getOperand(0).getReg())
+    .addMBB(NewBRTarget);
+  BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
+          TII->get(PPC::B)).addMBB(NewMBB);
+  if (BSI.MIToDelete)
+    BSI.MIToDelete->eraseFromParent();
+
+  // Change the condition on the original branch and invert it if requested.
+  auto FirstTerminator = NewMBB->getFirstTerminator();
+  if (BSI.NewCond) {
+    assert(FirstTerminator->getOperand(0).isReg() &&
+           "Can't update condition of unconditional branch.");
+    FirstTerminator->getOperand(0).setReg(BSI.NewCond->getOperand(0).getReg());
+  }
+  if (BSI.InvertOrigBranch)
+    FirstTerminator->setDesc(TII->get(InvertedOpcode));
+
+  // If any of the PHIs in the successors of NewMBB reference values that
+  // now come from NewMBB, they need to be updated.
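+  // For example (hypothetical operands): a PHI in a successor block such as
+  //   %10 = PHI %7, <ThisMBB>, %8, <OtherMBB>
+  // must be rewritten to take %7 from NewMBB whenever the definition of %7
+  // was moved there by the splice above.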
+  for (auto *Succ : NewMBB->successors()) {
+    updatePHIs(Succ, ThisMBB, NewMBB, MRI);
+  }
+  addIncomingValuesToPHIs(NewBRTarget, ThisMBB, NewMBB, MRI);
+
+  DEBUG(dbgs() << "After splitting, ThisMBB:\n"; ThisMBB->dump());
+  DEBUG(dbgs() << "NewMBB:\n"; NewMBB->dump());
+  DEBUG(dbgs() << "New branch-to block:\n"; NewBRTarget->dump());
+  return true;
+}
+
+
+#endif
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index bc2d9a08b5e86..3923417257e8c 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -43,3 +43,17 @@ MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol() const {
                                            "func_toc" +
                                            Twine(MF.getFunctionNumber()));
 }
+
+bool PPCFunctionInfo::isLiveInSExt(unsigned VReg) const {
+  for (const std::pair<unsigned, ISD::ArgFlagsTy> &LiveIn : LiveInAttrs)
+    if (LiveIn.first == VReg)
+      return LiveIn.second.isSExt();
+  return false;
+}
+
+bool PPCFunctionInfo::isLiveInZExt(unsigned VReg) const {
+  for (const std::pair<unsigned, ISD::ArgFlagsTy> &LiveIn : LiveInAttrs)
+    if (LiveIn.first == VReg)
+      return LiveIn.second.isZExt();
+  return false;
+}
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 202e10058b733..a9b6073106eae 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -16,6 +16,7 @@
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetCallingConv.h"
 
 namespace llvm {
 
@@ -113,6 +114,10 @@ class PPCFunctionInfo : public MachineFunctionInfo {
   /// copies
   bool IsSplitCSR = false;
 
+  /// We keep track of attributes for each live-in virtual register
+  /// to use the SExt/ZExt flags in later optimizations.
+  std::vector<std::pair<unsigned, ISD::ArgFlagsTy>> LiveInAttrs;
+
 public:
   explicit PPCFunctionInfo(MachineFunction &MF) : MF(MF) {}
 
@@ -175,6 +180,19 @@ public:
   unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; }
   void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; }
 
+  /// This function associates attributes with each live-in virtual register.
+  void addLiveInAttr(unsigned VReg, ISD::ArgFlagsTy Flags) {
+    LiveInAttrs.push_back(std::make_pair(VReg, Flags));
+  }
+
+  /// This function returns true if the specified vreg is
+  /// a live-in register and sign-extended.
+  bool isLiveInSExt(unsigned VReg) const;
+
+  /// This function returns true if the specified vreg is
+  /// a live-in register and zero-extended.
+  bool isLiveInZExt(unsigned VReg) const;
+
   int getCRSpillFrameIndex() const { return CRSpillFrameIndex; }
   void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; }
 
diff --git a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
new file mode 100644
index 0000000000000..9501f0f89b81b
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -0,0 +1,95 @@
+//===--------- PPCPreEmitPeephole.cpp - Late peephole optimizations -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pre-emit peephole for catching opportunities introduced by late passes
+// such as MachineBlockPlacement.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-pre-emit-peephole"
+
+STATISTIC(NumRRConvertedInPreEmit,
+          "Number of r+r instructions converted to r+i in pre-emit peephole");
+STATISTIC(NumRemovedInPreEmit,
+          "Number of instructions deleted in pre-emit peephole");
+
+static cl::opt<bool>
+RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(false),
+                   cl::desc("Run pre-emit peephole optimizations."));
+
+namespace {
+  class PPCPreEmitPeephole : public MachineFunctionPass {
+  public:
+    static char ID;
+    PPCPreEmitPeephole() : MachineFunctionPass(ID) {
+      initializePPCPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole)
+        return false;
+      bool Changed = false;
+      const PPCInstrInfo *TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo();
+      SmallVector<MachineInstr *, 4> InstrsToErase;
+      for (MachineBasicBlock &MBB : MF) {
+        for (MachineInstr &MI : MBB) {
+          MachineInstr *DefMIToErase = nullptr;
+          if (TII->convertToImmediateForm(MI, &DefMIToErase)) {
+            Changed = true;
+            NumRRConvertedInPreEmit++;
+            DEBUG(dbgs() << "Converted instruction to imm form: ");
+            DEBUG(MI.dump());
+            if (DefMIToErase) {
+              InstrsToErase.push_back(DefMIToErase);
+            }
+          }
+        }
+      }
+      for (MachineInstr *MI : InstrsToErase) {
+        DEBUG(dbgs() << "PPC pre-emit peephole: erasing instruction: ");
+        DEBUG(MI->dump());
+        MI->eraseFromParent();
+        NumRemovedInPreEmit++;
+      }
+      return Changed;
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCPreEmitPeephole, DEBUG_TYPE, "PowerPC Pre-Emit Peephole",
+                false, false)
+char PPCPreEmitPeephole::ID = 0;
+
+FunctionPass *llvm::createPPCPreEmitPeepholePass() {
+  return new PPCPreEmitPeephole();
+}
diff --git a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
index 8a18ab9e0e9a3..25b2b54cbe98a 100644
--- a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
+++ b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
@@ -22,9 +22,9 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "ppc-qpx-load-splat"
@@ -60,7 +60,7 @@ FunctionPass *llvm::createPPCQPXLoadSplatPass() {
 }
 
 bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(*MF.getFunction()))
+  if (skipFunction(MF.getFunction()))
     return false;
 
   bool MadeChange = false;
@@ -79,8 +79,8 @@ bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
       }
 
       // We're looking for a sequence like this:
-      // %F0 = LFD 0, %X3, %QF0; mem:LD8[%a](tbaa=!2)
-      // %QF1 = QVESPLATI %QF0, 0, %RM
+      // %f0 = LFD 0, killed %x3, implicit-def %qf0; mem:LD8[%a](tbaa=!2)
+      // %qf1 = QVESPLATI killed %qf0, 0, implicit %rm
 
     for (auto SI = Splats.begin(); SI != Splats.end();) {
       MachineInstr *SMI = *SI;
diff --git a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
new file mode 100644
index 0000000000000..5b2d7191683c0
--- /dev/null
+++ b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -0,0 +1,535 @@
+//===---- PPCReduceCRLogicals.cpp - Reduce CR Bit Logical operations ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass aims to reduce the number of logical operations on bits in the CR
+// register. These instructions have a fairly high latency and only a single
+// pipeline at their disposal in modern PPC cores. Furthermore, they have a
+// tendency to occur in fairly small blocks where there's little opportunity
+// to hide the latency between the CR logical operation and its user.
+//
+//===---------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-reduce-cr-ops"
+#include "PPCMachineBasicBlockUtils.h"
+
+STATISTIC(NumContainedSingleUseBinOps,
+          "Number of single-use binary CR logical ops contained in a block");
+STATISTIC(NumToSplitBlocks,
+          "Number of binary CR logical ops that can be used to split blocks");
+STATISTIC(TotalCRLogicals, "Number of CR logical ops.");
+STATISTIC(TotalNullaryCRLogicals,
+          "Number of nullary CR logical ops (CRSET/CRUNSET).");
+STATISTIC(TotalUnaryCRLogicals, "Number of unary CR logical ops.");
+STATISTIC(TotalBinaryCRLogicals, "Number of binary CR logical ops.");
+STATISTIC(NumBlocksSplitOnBinaryCROp,
+          "Number of blocks split on CR binary logical ops.");
+STATISTIC(NumNotSplitIdenticalOperands,
+          "Number of blocks not split due to operands being identical.");
+STATISTIC(NumNotSplitChainCopies,
+          "Number of blocks not split due to operands being chained copies.");
+STATISTIC(NumNotSplitWrongOpcode,
+          "Number of blocks not split due to the wrong opcode.");
+
+namespace llvm {
+  void initializePPCReduceCRLogicalsPass(PassRegistry&);
+}
+
+namespace {
+
+static bool isBinary(MachineInstr &MI) {
+  return MI.getNumOperands() == 3;
+}
+
+static bool isNullary(MachineInstr &MI) {
+  return MI.getNumOperands() == 1;
+}
+
+/// Given a CR logical operation \p CROp, a branch opcode \p BROp, as well as
+/// a flag to indicate if the first operand of \p CROp is used as the
+/// SplitBefore operand, this function determines whether either of the
+/// branches is to be inverted as well as whether the new target should be
+/// the original fall-through block.
+static void
+computeBranchTargetAndInversion(unsigned CROp, unsigned BROp, bool UsingDef1,
+                                bool &InvertNewBranch, bool &InvertOrigBranch,
+                                bool &TargetIsFallThrough) {
+  // The conditions under which each of the output operands should be [un]set
+  // can certainly be written much more concisely with just 3 if statements or
+  // ternary expressions. However, this provides a much clearer overview to the
+  // reader as to what is set for each combination.
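+  // For instance (one illustrative row of the tables below): a CRAND feeding
+  // a plain BC takes the original target only when both inputs are true, so
+  // the new branch tests one input, is inverted, and targets the fall-through
+  // block; the original branch then only needs to test the remaining input.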
+  if (BROp == PPC::BC || BROp == PPC::BCLR) {
+    // Regular branches.
+    switch (CROp) {
+    default:
+      llvm_unreachable("Don't know how to handle this CR logical.");
+    case PPC::CROR:
+      InvertNewBranch = false;
+      InvertOrigBranch = false;
+      TargetIsFallThrough = false;
+      return;
+    case PPC::CRAND:
+      InvertNewBranch = true;
+      InvertOrigBranch = false;
+      TargetIsFallThrough = true;
+      return;
+    case PPC::CRNAND:
+      InvertNewBranch = true;
+      InvertOrigBranch = true;
+      TargetIsFallThrough = false;
+      return;
+    case PPC::CRNOR:
+      InvertNewBranch = false;
+      InvertOrigBranch = true;
+      TargetIsFallThrough = true;
+      return;
+    case PPC::CRORC:
+      InvertNewBranch = UsingDef1;
+      InvertOrigBranch = !UsingDef1;
+      TargetIsFallThrough = false;
+      return;
+    case PPC::CRANDC:
+      InvertNewBranch = !UsingDef1;
+      InvertOrigBranch = !UsingDef1;
+      TargetIsFallThrough = true;
+      return;
+    }
+  } else if (BROp == PPC::BCn || BROp == PPC::BCLRn) {
+    // Negated branches.
+    switch (CROp) {
+    default:
+      llvm_unreachable("Don't know how to handle this CR logical.");
+    case PPC::CROR:
+      InvertNewBranch = true;
+      InvertOrigBranch = false;
+      TargetIsFallThrough = true;
+      return;
+    case PPC::CRAND:
+      InvertNewBranch = false;
+      InvertOrigBranch = false;
+      TargetIsFallThrough = false;
+      return;
+    case PPC::CRNAND:
+      InvertNewBranch = false;
+      InvertOrigBranch = true;
+      TargetIsFallThrough = true;
+      return;
+    case PPC::CRNOR:
+      InvertNewBranch = true;
+      InvertOrigBranch = true;
+      TargetIsFallThrough = false;
+      return;
+    case PPC::CRORC:
+      InvertNewBranch = !UsingDef1;
+      InvertOrigBranch = !UsingDef1;
+      TargetIsFallThrough = true;
+      return;
+    case PPC::CRANDC:
+      InvertNewBranch = UsingDef1;
+      InvertOrigBranch = !UsingDef1;
+      TargetIsFallThrough = false;
+      return;
+    }
+  } else
+    llvm_unreachable("Don't know how to handle this branch.");
+}
+
+class PPCReduceCRLogicals : public MachineFunctionPass {
+
+public:
+  static char ID;
+  struct CRLogicalOpInfo {
+    MachineInstr *MI;
+    // FIXME: If chains of copies are to be handled, this should be a vector.
+    std::pair<MachineInstr *, MachineInstr *> CopyDefs;
+    std::pair<MachineInstr *, MachineInstr *> TrueDefs;
+    unsigned IsBinary : 1;
+    unsigned IsNullary : 1;
+    unsigned ContainedInBlock : 1;
+    unsigned FeedsISEL : 1;
+    unsigned FeedsBR : 1;
+    unsigned FeedsLogical : 1;
+    unsigned SingleUse : 1;
+    unsigned DefsSingleUse : 1;
+    unsigned SubregDef1;
+    unsigned SubregDef2;
+    CRLogicalOpInfo() : MI(nullptr), IsBinary(0), IsNullary(0),
+                        ContainedInBlock(0), FeedsISEL(0), FeedsBR(0),
+                        FeedsLogical(0), SingleUse(0), DefsSingleUse(1),
+                        SubregDef1(0), SubregDef2(0) { }
+    void dump();
+  };
+
+private:
+  const PPCInstrInfo *TII;
+  MachineFunction *MF;
+  MachineRegisterInfo *MRI;
+  const MachineBranchProbabilityInfo *MBPI;
+
+  // A vector to contain all the CR logical operations.
+  std::vector<CRLogicalOpInfo> AllCRLogicalOps;
+  void initialize(MachineFunction &MFParm);
+  void collectCRLogicals();
+  bool handleCROp(CRLogicalOpInfo &CRI);
+  bool splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI);
+  static bool isCRLogical(MachineInstr &MI) {
+    unsigned Opc = MI.getOpcode();
+    return Opc == PPC::CRAND || Opc == PPC::CRNAND || Opc == PPC::CROR ||
+           Opc == PPC::CRXOR || Opc == PPC::CRNOR || Opc == PPC::CREQV ||
+           Opc == PPC::CRANDC || Opc == PPC::CRORC || Opc == PPC::CRSET ||
+           Opc == PPC::CRUNSET || Opc == PPC::CR6SET || Opc == PPC::CR6UNSET;
+  }
+  bool simplifyCode() {
+    bool Changed = false;
+    // Not using a range-based for loop here as the vector may grow while being
+    // operated on.
+    for (unsigned i = 0; i < AllCRLogicalOps.size(); i++)
+      Changed |= handleCROp(AllCRLogicalOps[i]);
+    return Changed;
+  }
+
+public:
+  PPCReduceCRLogicals() : MachineFunctionPass(ID) {
+    initializePPCReduceCRLogicalsPass(*PassRegistry::getPassRegistry());
+  }
+
+  MachineInstr *lookThroughCRCopy(unsigned Reg, unsigned &Subreg,
+                                  MachineInstr *&CpDef);
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+
+    // If the subtarget doesn't use CR bits, there's nothing to do.
+    const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+    if (!STI.useCRBits())
+      return false;
+
+    initialize(MF);
+    collectCRLogicals();
+    return simplifyCode();
+  }
+  CRLogicalOpInfo createCRLogicalOpInfo(MachineInstr &MI);
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineBranchProbabilityInfo>();
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void PPCReduceCRLogicals::CRLogicalOpInfo::dump() {
+  dbgs() << "CRLogicalOpMI: ";
+  MI->dump();
+  dbgs() << "IsBinary: " << IsBinary << ", FeedsISEL: " << FeedsISEL;
+  dbgs() << ", FeedsBR: " << FeedsBR << ", FeedsLogical: ";
+  dbgs() << FeedsLogical << ", SingleUse: " << SingleUse;
+  dbgs() << ", DefsSingleUse: " << DefsSingleUse;
+  dbgs() << ", SubregDef1: " << SubregDef1 << ", SubregDef2: ";
+  dbgs() << SubregDef2 << ", ContainedInBlock: " << ContainedInBlock;
+  if (!IsNullary) {
+    dbgs() << "\nDefs:\n";
+    TrueDefs.first->dump();
+  }
+  if (IsBinary)
+    TrueDefs.second->dump();
+  dbgs() << "\n";
+  if (CopyDefs.first) {
+    dbgs() << "CopyDef1: ";
+    CopyDefs.first->dump();
+  }
+  if (CopyDefs.second) {
+    dbgs() << "CopyDef2: ";
+    CopyDefs.second->dump();
+  }
+}
+#endif
+
+PPCReduceCRLogicals::CRLogicalOpInfo
+PPCReduceCRLogicals::createCRLogicalOpInfo(MachineInstr &MIParam) {
+  CRLogicalOpInfo Ret;
+  Ret.MI = &MIParam;
+  // Get the defs.
+  if (isNullary(MIParam)) {
+    Ret.IsNullary = 1;
+    Ret.TrueDefs = std::make_pair(nullptr, nullptr);
+    Ret.CopyDefs = std::make_pair(nullptr, nullptr);
+  } else {
+    MachineInstr *Def1 = lookThroughCRCopy(MIParam.getOperand(1).getReg(),
+                                           Ret.SubregDef1, Ret.CopyDefs.first);
+    Ret.DefsSingleUse &=
+      MRI->hasOneNonDBGUse(Def1->getOperand(0).getReg());
+    Ret.DefsSingleUse &=
+      MRI->hasOneNonDBGUse(Ret.CopyDefs.first->getOperand(0).getReg());
+    assert(Def1 && "Must be able to find a definition of operand 1.");
+    if (isBinary(MIParam)) {
+      Ret.IsBinary = 1;
+      MachineInstr *Def2 = lookThroughCRCopy(MIParam.getOperand(2).getReg(),
+                                             Ret.SubregDef2,
+                                             Ret.CopyDefs.second);
+      Ret.DefsSingleUse &=
+        MRI->hasOneNonDBGUse(Def2->getOperand(0).getReg());
+      Ret.DefsSingleUse &=
+        MRI->hasOneNonDBGUse(Ret.CopyDefs.second->getOperand(0).getReg());
+      assert(Def2 && "Must be able to find a definition of operand 2.");
+      Ret.TrueDefs = std::make_pair(Def1, Def2);
+    } else {
+      Ret.TrueDefs = std::make_pair(Def1, nullptr);
+      Ret.CopyDefs.second = nullptr;
+    }
+  }
+
+  Ret.ContainedInBlock = 1;
+  // Get the uses.
+  for (MachineInstr &UseMI :
+       MRI->use_nodbg_instructions(MIParam.getOperand(0).getReg())) {
+    unsigned Opc = UseMI.getOpcode();
+    if (Opc == PPC::ISEL || Opc == PPC::ISEL8)
+      Ret.FeedsISEL = 1;
+    if (Opc == PPC::BC || Opc == PPC::BCn || Opc == PPC::BCLR ||
+        Opc == PPC::BCLRn)
+      Ret.FeedsBR = 1;
+    Ret.FeedsLogical = isCRLogical(UseMI);
+    if (UseMI.getParent() != MIParam.getParent())
+      Ret.ContainedInBlock = 0;
+  }
+  Ret.SingleUse = MRI->hasOneNonDBGUse(MIParam.getOperand(0).getReg()) ? 1 : 0;
+
+  // We now know whether all the uses of the CR logical are in the same block.
+  if (!Ret.IsNullary) {
+    Ret.ContainedInBlock &=
+      (MIParam.getParent() == Ret.TrueDefs.first->getParent());
+    if (Ret.IsBinary)
+      Ret.ContainedInBlock &=
+        (MIParam.getParent() == Ret.TrueDefs.second->getParent());
+  }
+  DEBUG(Ret.dump());
+  if (Ret.IsBinary && Ret.ContainedInBlock && Ret.SingleUse) {
+    NumContainedSingleUseBinOps++;
+    if (Ret.FeedsBR && Ret.DefsSingleUse)
+      NumToSplitBlocks++;
+  }
+  return Ret;
+}
+
+/// Looks through a COPY instruction to the actual definition of the CR-bit
+/// register and returns the instruction that defines it.
+/// FIXME: This currently handles what is by-far the most common case:
+/// an instruction that defines a CR field followed by a single copy of a bit
+/// from that field into a virtual register. If chains of copies need to be
+/// handled, this should have a loop until a non-copy instruction is found.
+MachineInstr *PPCReduceCRLogicals::lookThroughCRCopy(unsigned Reg,
+                                                     unsigned &Subreg,
+                                                     MachineInstr *&CpDef) {
+  Subreg = -1;
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return nullptr;
+  MachineInstr *Copy = MRI->getVRegDef(Reg);
+  CpDef = Copy;
+  if (!Copy->isCopy())
+    return Copy;
+  unsigned CopySrc = Copy->getOperand(1).getReg();
+  Subreg = Copy->getOperand(1).getSubReg();
+  if (!TargetRegisterInfo::isVirtualRegister(CopySrc)) {
+    const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+    // Set the Subreg.
+    if (CopySrc == PPC::CR0EQ || CopySrc == PPC::CR6EQ)
+      Subreg = PPC::sub_eq;
+    if (CopySrc == PPC::CR0LT || CopySrc == PPC::CR6LT)
+      Subreg = PPC::sub_lt;
+    if (CopySrc == PPC::CR0GT || CopySrc == PPC::CR6GT)
+      Subreg = PPC::sub_gt;
+    if (CopySrc == PPC::CR0UN || CopySrc == PPC::CR6UN)
+      Subreg = PPC::sub_un;
+    // Loop backwards and return the first MI that modifies the physical CR
+    // register.
+    MachineBasicBlock::iterator Me = Copy, B = Copy->getParent()->begin();
+    while (Me != B)
+      if ((--Me)->modifiesRegister(CopySrc, TRI))
+        return &*Me;
+    return nullptr;
+  }
+  return MRI->getVRegDef(CopySrc);
+}
+
+void PPCReduceCRLogicals::initialize(MachineFunction &MFParam) {
+  MF = &MFParam;
+  MRI = &MF->getRegInfo();
+  TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+  AllCRLogicalOps.clear();
+}
+
+/// Contains all the implemented transformations on CR logical operations.
+/// For example, a binary CR logical can be used to split a block on its
+/// inputs, a unary CR logical might be used to change the condition code on
+/// a comparison feeding it. A nullary CR logical might simply be removable
+/// if the user of the bit it [un]sets can be transformed.
+bool PPCReduceCRLogicals::handleCROp(CRLogicalOpInfo &CRI) {
+  // We can definitely split a block on the inputs to a binary CR operation
+  // whose defs and (single) use are within the same block.
+  bool Changed = false;
+  if (CRI.IsBinary && CRI.ContainedInBlock && CRI.SingleUse && CRI.FeedsBR &&
+      CRI.DefsSingleUse) {
+    Changed = splitBlockOnBinaryCROp(CRI);
+    if (Changed)
+      NumBlocksSplitOnBinaryCROp++;
+  }
+  return Changed;
+}
+
+/// Splits a block that contains a CR-logical operation that feeds a branch
+/// and whose operands are produced within the block.
+/// Example:
+///    %vr5 = CMPDI %vr2, 0; CRRC:%vr5 G8RC:%vr2
+///    %vr6 = COPY %vr5:sub_eq; CRBITRC:%vr6 CRRC:%vr5
+///    %vr7 = CMPDI %vr3, 0; CRRC:%vr7 G8RC:%vr3
+///    %vr8 = COPY %vr7:sub_eq; CRBITRC:%vr8 CRRC:%vr7
+///    %vr9 = CROR %vr6, %vr8; CRBITRC:%vr9,%vr6,%vr8
+///    BC %vr9, <BB#2>; CRBITRC:%vr9
+/// Becomes:
+///    %vr5 = CMPDI %vr2, 0; CRRC:%vr5 G8RC:%vr2
+///    %vr6 = COPY %vr5:sub_eq; CRBITRC:%vr6 CRRC:%vr5
+///    BC %vr6, <BB#2>; CRBITRC:%vr6
+///
+///    %vr7 = CMPDI %vr3, 0; CRRC:%vr7 G8RC:%vr3
+///    %vr8 = COPY %vr7:sub_eq; CRBITRC:%vr8 CRRC:%vr7
+///    BC %vr8, <BB#2>; CRBITRC:%vr8
+bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
+  if (CRI.CopyDefs.first == CRI.CopyDefs.second) {
+    DEBUG(dbgs() << "Unable to split as the two operands are the same\n");
+    NumNotSplitIdenticalOperands++;
+    return false;
+  }
+  if (CRI.TrueDefs.first->isCopy() || CRI.TrueDefs.second->isCopy() ||
+      CRI.TrueDefs.first->isPHI() || CRI.TrueDefs.second->isPHI()) {
+    DEBUG(dbgs() << "Unable to split because one of the operands is a PHI or "
+                    "chain of copies.\n");
+    NumNotSplitChainCopies++;
+    return false;
+  }
+  // Note: keep in sync with computeBranchTargetAndInversion().
+  if (CRI.MI->getOpcode() != PPC::CROR &&
+      CRI.MI->getOpcode() != PPC::CRAND &&
+      CRI.MI->getOpcode() != PPC::CRNOR &&
+      CRI.MI->getOpcode() != PPC::CRNAND &&
+      CRI.MI->getOpcode() != PPC::CRORC &&
+      CRI.MI->getOpcode() != PPC::CRANDC) {
+    DEBUG(dbgs() << "Unable to split blocks on this opcode.\n");
+    NumNotSplitWrongOpcode++;
+    return false;
+  }
+  DEBUG(dbgs() << "Splitting the following CR op:\n"; CRI.dump());
+  MachineBasicBlock::iterator Def1It = CRI.TrueDefs.first;
+  MachineBasicBlock::iterator Def2It = CRI.TrueDefs.second;
+
+  bool UsingDef1 = false;
+  MachineInstr *SplitBefore = &*Def2It;
+  for (auto E = CRI.MI->getParent()->end(); Def2It != E; ++Def2It) {
+    if (Def1It == Def2It) { // Def2 comes before Def1.
+      SplitBefore = &*Def1It;
+      UsingDef1 = true;
+      break;
+    }
+  }
+
+  DEBUG(dbgs() << "We will split the following block:\n";);
+  DEBUG(CRI.MI->getParent()->dump());
+  DEBUG(dbgs() << "Before instruction:\n"; SplitBefore->dump());
+
+  // Get the branch instruction.
+  MachineInstr *Branch =
+    MRI->use_nodbg_begin(CRI.MI->getOperand(0).getReg())->getParent();
+
+  // We want the new block to have no code in it other than the definition
+  // of the input to the CR logical and the CR logical itself. So we move
+  // those to the bottom of the block (just before the branch). Then we
+  // will split before the CR logical.
+  MachineBasicBlock *MBB = SplitBefore->getParent();
+  auto FirstTerminator = MBB->getFirstTerminator();
+  MachineBasicBlock::iterator FirstInstrToMove =
+    UsingDef1 ? CRI.TrueDefs.first : CRI.TrueDefs.second;
+  MachineBasicBlock::iterator SecondInstrToMove =
+    UsingDef1 ? CRI.CopyDefs.first : CRI.CopyDefs.second;
+
+  // The instructions that need to be moved are not guaranteed to be
+  // contiguous. Move them individually.
+  // FIXME: If one of the operands is a chain of (single use) copies, they
+  // can all be moved and we can still split.
+  MBB->splice(FirstTerminator, MBB, FirstInstrToMove);
+  if (FirstInstrToMove != SecondInstrToMove)
+    MBB->splice(FirstTerminator, MBB, SecondInstrToMove);
+  MBB->splice(FirstTerminator, MBB, CRI.MI);
+
+  unsigned Opc = CRI.MI->getOpcode();
+  bool InvertOrigBranch, InvertNewBranch, TargetIsFallThrough;
+  computeBranchTargetAndInversion(Opc, Branch->getOpcode(), UsingDef1,
+                                  InvertNewBranch, InvertOrigBranch,
+                                  TargetIsFallThrough);
+  MachineInstr *SplitCond =
+    UsingDef1 ? CRI.CopyDefs.second : CRI.CopyDefs.first;
+  DEBUG(dbgs() << "We will " << (InvertNewBranch ? "invert" : "copy"));
+  DEBUG(dbgs() << " the original branch and the target is the " <<
+        (TargetIsFallThrough ? "fallthrough block\n" : "orig. target block\n"));
+  DEBUG(dbgs() << "Original branch instruction: "; Branch->dump());
+  BlockSplitInfo BSI { Branch, SplitBefore, SplitCond, InvertNewBranch,
+                       InvertOrigBranch, TargetIsFallThrough, MBPI, CRI.MI,
+                       UsingDef1 ? CRI.CopyDefs.first : CRI.CopyDefs.second };
+  bool Changed = splitMBB(BSI);
+  // If we've split on a CR logical that is fed by a CR logical,
+  // recompute the source CR logical as it may be usable for splitting.
+  if (Changed) {
+    bool Input1CRlogical =
+      CRI.TrueDefs.first && isCRLogical(*CRI.TrueDefs.first);
+    bool Input2CRlogical =
+      CRI.TrueDefs.second && isCRLogical(*CRI.TrueDefs.second);
+    if (Input1CRlogical)
+      AllCRLogicalOps.push_back(createCRLogicalOpInfo(*CRI.TrueDefs.first));
+    if (Input2CRlogical)
+      AllCRLogicalOps.push_back(createCRLogicalOpInfo(*CRI.TrueDefs.second));
+  }
+  return Changed;
+}
+
+void PPCReduceCRLogicals::collectCRLogicals() {
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
+      if (isCRLogical(MI)) {
+        AllCRLogicalOps.push_back(createCRLogicalOpInfo(MI));
+        TotalCRLogicals++;
+        if (AllCRLogicalOps.back().IsNullary)
+          TotalNullaryCRLogicals++;
+        else if (AllCRLogicalOps.back().IsBinary)
+          TotalBinaryCRLogicals++;
+        else
+          TotalUnaryCRLogicals++;
+      }
+    }
+  }
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(PPCReduceCRLogicals, DEBUG_TYPE,
+                      "PowerPC Reduce CR logical Operation", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PPCReduceCRLogicals, DEBUG_TYPE,
+                    "PowerPC Reduce CR logical Operation", false, false)
+
+char PPCReduceCRLogicals::ID = 0;
+FunctionPass*
+llvm::createPPCReduceCRLogicalsPass() { return new PPCReduceCRLogicals(); }
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 9207165c46a6d..6b62a82ef7bf9 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -21,12 +21,15 @@
 #include "PPCTargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -36,8 +39,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <cstdlib>
@@ -49,6 +50,9 @@ using namespace llvm;
 #define GET_REGINFO_TARGET_DESC
 #include "PPCGenRegisterInfo.inc"
 
+STATISTIC(InflateGPRC, "Number of gprc inputs for getLargestLegalClass");
+STATISTIC(InflateGP8RC, "Number of g8rc inputs for getLargestLegalClass");
+
 static cl::opt<bool>
 EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true),
          cl::desc("Enable use of a base pointer for complex stack frames"));
 
@@ -57,6 +61,10 @@ static cl::opt<bool>
 AlwaysBasePointer("ppc-always-use-base-pointer",
                   cl::Hidden, cl::init(false),
          cl::desc("Force the use of a base pointer in every function"));
 
+static cl::opt<bool>
+EnableGPRToVecSpills("ppc-enable-gpr-to-vsr-spills", cl::Hidden, cl::init(false),
+             cl::desc("Enable spills from gpr to vsr rather than stack"));
+
 PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
   : PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR,
                        TM.isPPC64() ? 0 : 1,
@@ -82,6 +90,8 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
   // VSX
   ImmToIdxMap[PPC::DFLOADf32] = PPC::LXSSPX;
   ImmToIdxMap[PPC::DFLOADf64] = PPC::LXSDX;
+  ImmToIdxMap[PPC::SPILLTOVSR_LD] = PPC::SPILLTOVSR_LDX;
+  ImmToIdxMap[PPC::SPILLTOVSR_ST] = PPC::SPILLTOVSR_STX;
   ImmToIdxMap[PPC::DFSTOREf32] = PPC::STXSSPX;
   ImmToIdxMap[PPC::DFSTOREf64] = PPC::STXSDX;
   ImmToIdxMap[PPC::LXV] = PPC::LXVX;
@@ -113,7 +123,7 @@ PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
 const MCPhysReg*
 PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
-  if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) {
+  if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) {
     if (Subtarget.hasVSX())
       return CSR_64_AllRegs_VSX_SaveList;
     if (Subtarget.hasAltivec())
@@ -151,7 +161,7 @@ PPCRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
     return nullptr;
   if (!TM.isPPC64())
     return nullptr;
-  if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS)
+  if (MF->getFunction().getCallingConv() != CallingConv::CXX_FAST_TLS)
     return nullptr;
   if (!MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
     return nullptr;
@@ -328,6 +338,18 @@ PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
   // With VSX, we can inflate various sub-register classes to the full VSX
   // register set.
 
+  // For Power9 we allow the user to enable GPR to vector spills.
+  // FIXME: Currently limited to spilling GP8RC. A follow on patch will add
+  // support to spill GPRC.
+  if (TM.isELFv2ABI()) {
+    if (Subtarget.hasP9Vector() && EnableGPRToVecSpills &&
+        RC == &PPC::G8RCRegClass) {
+      InflateGP8RC++;
+      return &PPC::SPILLTOVSRRCRegClass;
+    }
+    if (RC == &PPC::GPRCRegClass && EnableGPRToVecSpills)
+      InflateGPRC++;
+  }
   if (RC == &PPC::F8RCRegClass)
     return &PPC::VSFRCRegClass;
   else if (RC == &PPC::VRRCRegClass)
@@ -879,7 +901,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // Naked functions have stack size 0, although getStackSize may not reflect
   // that because we didn't call all the pieces that compute it for naked
   // functions.
-  if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) {
+  if (!MF.getFunction().hasFnAttribute(Attribute::Naked)) {
     if (!(hasBasePointer(MF) && FrameIndex < 0))
       Offset += MFI.getStackSize();
   }
@@ -911,11 +933,16 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     SReg = MF.getRegInfo().createVirtualRegister(RC);
 
   // Insert a set of rA with the full offset value before the ld, st, or add
-  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
-    .addImm(Offset >> 16);
-  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg)
-    .addReg(SRegHi, RegState::Kill)
-    .addImm(Offset);
+  if (isInt<16>(Offset))
+    BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LI8 : PPC::LI), SReg)
+      .addImm(Offset);
+  else {
+    BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
+      .addImm(Offset >> 16);
+    BuildMI(MBB, II, dl, TII.get(is64Bit ?
PPC::ORI8 : PPC::ORI), SReg) + .addReg(SRegHi, RegState::Kill) + .addImm(Offset); + } // Convert into indexed form of the instruction: // diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index 896cec7e4f6e8..f7807907bd640 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -305,6 +305,11 @@ def VFRC : RegisterClass<"PPC", [f64], 64, VF22, VF21, VF20)>; def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>; +// Allow spilling GPR's into caller-saved VSR's. +def SPILLTOVSRRC : RegisterClass<"PPC", [i64, f64], 64, (add G8RC, (sub VSFRC, + (sequence "VF%u", 31, 20), + (sequence "F%u", 31, 14)))>; + // Register class for single precision scalars in VSX registers def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>; diff --git a/lib/Target/PowerPC/PPCScheduleP9.td b/lib/Target/PowerPC/PPCScheduleP9.td index a01995a629c29..b24f4fc603a15 100644 --- a/lib/Target/PowerPC/PPCScheduleP9.td +++ b/lib/Target/PowerPC/PPCScheduleP9.td @@ -22,7 +22,9 @@ def P9Model : SchedMachineModel { // Try to make sure we have at least 10 dispatch groups in a loop. let LoopMicroOpBufferSize = 60; - let CompleteModel = 0; + let CompleteModel = 1; + + let UnsupportedFeatures = [HasQPX]; } @@ -68,6 +70,10 @@ let SchedModel = P9Model in { def LS : ProcResource<4>; def PM : ProcResource<2>; def DFU : ProcResource<1>; + def BR : ProcResource<1> { + let BufferSize = 16; + } + def CY : ProcResource<1>; def TestGroup : ProcResGroup<[ALU, DP]>; @@ -145,6 +151,10 @@ let SchedModel = P9Model in { let Latency = 6; } + def P9_DIV_12C : SchedWriteRes<[DIV]> { + let Latency = 12; + } + def P9_DIV_16C_8 : SchedWriteRes<[DIV]> { let ResourceCycles = [8]; let Latency = 16; @@ -190,6 +200,16 @@ let SchedModel = P9Model in { let Latency = 24; } + def P9_DPO_24C_8 : SchedWriteRes<[DPO]> { + let ResourceCycles = [8]; + let Latency = 24; + } + + def P9_DPE_24C_8 : SchedWriteRes<[DPE]> { + let ResourceCycles = [8]; + let Latency = 24; + } + def P9_DP_26C_5 : SchedWriteRes<[DP]> { let ResourceCycles = [5]; let Latency = 22; @@ -205,6 +225,16 @@ let SchedModel = P9Model in { let Latency = 33; } + def P9_DPE_33C_8 : SchedWriteRes<[DPE]> { + let ResourceCycles = [8]; + let Latency = 33; + } + + def P9_DPO_33C_8 : SchedWriteRes<[DPO]> { + let ResourceCycles = [8]; + let Latency = 33; + } + def P9_DP_36C_10 : SchedWriteRes<[DP]> { let ResourceCycles = [10]; let Latency = 36; @@ -248,31 +278,61 @@ let SchedModel = P9Model in { let Latency = 76; let ResourceCycles = [62]; } + + def P9_BR_2C : SchedWriteRes<[BR]> { + let Latency = 2; + } + + def P9_BR_5C : SchedWriteRes<[BR]> { + let Latency = 5; + } + + def P9_CY_6C : SchedWriteRes<[CY]> { + let Latency = 6; + } + // ***************** WriteSeq Definitions ***************** def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>; def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>; def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>; + def P9_LoadAndLoadOp_8C : WriteSequence<[P9_LS_4C, P9_LS_4C]>; def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>; def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>; def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>; def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>; + def P9_DPOpAndALUOp_9C : WriteSequence<[P9_DP_7C, P9_ALU_2C]>; + def P9_DPOpAndALUOp_24C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_2C]>; + def P9_DPOpAndALUOp_35C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_2C]>; // 
***************** Defining Itinerary Class Resources ***************** + // The following itineraries are fully covered by the InstRW definitions in + // P9InstrResources.td so aren't listed here. + // IIC_FPDivD, IIC_FPDivS, IIC_FPFused, IIC_IntDivD, IIC_LdStLFDU, + // IIC_LdStLFDUX + def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - [IIC_IntSimple, IIC_IntGeneral]>; + [IIC_IntSimple, IIC_IntGeneral, IIC_IntRFID, + IIC_IntRotateD, IIC_IntRotateDI, IIC_IntTrapD, + IIC_SprRFI]>; + + def : ItinRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_IntTrapW]>; def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>; def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntCompare]>; + def : ItinRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C], [IIC_VecGeneral, IIC_FPCompare]>; + def : ItinRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], - [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI]>; + [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI, IIC_IntMulHD]>; def : ItinRW<[P9_LS_5C, IP_EXEC_1C, DISP_1C, DISP_1C], - [IIC_LdStLoad, IIC_LdStLD]>; + [IIC_LdStLoad, IIC_LdStLD, IIC_LdStLFD]>; def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], @@ -300,12 +360,18 @@ let SchedModel = P9Model in { def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_LdStLWARX, IIC_LdStLDARX, IIC_LdStLMW]>; + def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_LdStCOPY, IIC_SprABORT, IIC_LdStPASTE, IIC_LdStDCBF, + IIC_LdStICBI, IIC_LdStSync, IIC_SprISYNC, IIC_SprMSGSYNC, + IIC_SprSLBIA, IIC_SprSLBSYNC, IIC_SprTLBSYNC]>; + def : ItinRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_LdStSTFD, IIC_LdStSTD, IIC_LdStStore]>; def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], - [IIC_LdStSTDU, IIC_LdStSTDUX]>; + [IIC_LdStSTDU, IIC_LdStSTDUX, IIC_LdStStoreUpd, IIC_SprSLBIEG, + IIC_SprTLBIA, IIC_SprTLBIE]>; def : ItinRW<[P9_StoreAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], @@ -315,20 +381,44 @@ let SchedModel = P9Model in { [IIC_BrCR, IIC_IntMTFSB0]>; def : ItinRW<[P9_ALUOpAndALUOp_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, - IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C, DISP_1C, DISP_1C], [IIC_SprMFCR, IIC_SprMFCRF]>; + IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C], + [IIC_SprMFCR, IIC_SprMFCRF, IIC_BrMCR, IIC_BrMCRX, IIC_IntMFFS]>; + + def : ItinRW<[P9_BR_2C, DISP_1C], [IIC_BrB]>; + def : ItinRW<[P9_BR_5C, DISP_1C], [IIC_SprMFSPR]>; // This class should be broken down to instruction level, once some missing // info is obtained. def : ItinRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_SprMTSPR]>; - def : ItinRW<[P9_DP_7C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C], [IIC_FPGeneral, IIC_FPAddSub]>; + def : ItinRW<[P9_LoadAndLoadOp_8C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_SprSLBIE, IIC_SprSLBMFEE, IIC_SprSLBMFEV, IIC_SprSLBMTE, + IIC_SprTLBIEL]>; + + // IIC_VecFP is added here although many instructions with that itinerary + // use very different resources. It would appear that instructions were + // given that itinerary rather carelessly over time. Specific instructions + // that use different resources are listed in various InstrRW classes. 
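  // For example, an instruction tagged IIC_VecFP that actually executes in
  // the permute pipeline would get an InstRW entry in P9InstrResources.td;
  // InstRW entries take precedence over the ItinRW mappings in this file for
  // the instructions they list.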
+  def : ItinRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_FPGeneral, IIC_FPAddSub, IIC_VecFP]>;
+
+  def : ItinRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
+                DISP_1C, DISP_1C], [IIC_VecFPCompare]>;
+
+  def : ItinRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
+               [IIC_VecPerm]>;
 
   def : ItinRW<[P9_DP_36C_10, IP_EXEC_1C], [IIC_FPSqrtD]>;
   def : ItinRW<[P9_DP_26C_5, P9_DP_26C_5, IP_EXEC_1C, IP_EXEC_1C],
                [IIC_FPSqrtS]>;
 
+  def : ItinRW<[P9_DIV_12C, IP_EXECE_1C, DISP_1C, DISP_1C],
+               [IIC_SprMFMSR, IIC_SprMFPMR, IIC_SprMFSR, IIC_SprMFTB,
+                IIC_SprMTMSR, IIC_SprMTMSRD, IIC_SprMTPMR, IIC_SprMTSR]>;
+
+  def : ItinRW<[], [IIC_SprSTOP]>;
+
   include "P9InstrResources.td"
 }
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 90d11f46a384d..c351b5c04a056 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -19,9 +19,9 @@
 #include "PPCInstrInfo.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include <string>
 
 #define GET_SUBTARGETINFO_HEADER
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 5f8085f4626e2..49f2699ab082e 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -25,7 +25,7 @@
 #include "PPCInstrBuilder.h"
 #include "PPCInstrInfo.h"
 #include "PPCTargetMachine.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Support/Debug.h"
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index fe092cc3b858d..491f25ca2c64a 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DataLayout.h"
@@ -31,7 +32,6 @@
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Scalar.h"
 #include <cassert>
 
@@ -40,6 +40,10 @@ using namespace llvm;
 
+static cl::opt<bool>
+    EnableBranchCoalescing("enable-ppc-branch-coalesce", cl::Hidden,
+        cl::desc("enable coalescing of duplicate branches for PPC"));
 static cl::
 opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
                           cl::desc("Disable CTR loops for PPC"));
@@ -84,6 +88,10 @@ EnableMachineCombinerPass("ppc-machine-combiner",
                           cl::desc("Enable the machine combiner pass"),
                           cl::init(true), cl::Hidden);
 
+static cl::opt<bool>
+    ReduceCRLogical("ppc-reduce-cr-logicals",
+        cl::desc("Expand eligible cr-logical binary ops to branches"),
+        cl::init(false), cl::Hidden);
 extern "C" void LLVMInitializePowerPCTarget() {
   // Register the targets
   RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
@@ -93,7 +101,9 @@ extern "C" void LLVMInitializePowerPCTarget() {
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializePPCBoolRetToIntPass(PR);
   initializePPCExpandISELPass(PR);
+  initializePPCPreEmitPeepholePass(PR);
   initializePPCTLSDynamicCallPass(PR);
+  initializePPCMIPeepholePass(PR);
 }
 
 /// Return the datalayout string of a subtarget.
@@ -208,6 +218,17 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
   return Reloc::Static;
 }
 
+static CodeModel::Model getEffectiveCodeModel(const Triple &TT,
+                                              Optional<CodeModel::Model> CM,
+                                              bool JIT) {
+  if (CM)
+    return *CM;
+  if (!TT.isOSDarwin() && !JIT &&
+      (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le))
+    return CodeModel::Medium;
+  return CodeModel::Small;
+}
+
 // The FeatureString here is a little subtle. We are modifying the feature
 // string with what are (currently) non-function specific overrides as it goes
 // into the LLVMTargetMachine constructor and then using the stored value in the
@@ -216,10 +237,12 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
                                    StringRef CPU, StringRef FS,
                                    const TargetOptions &Options,
                                    Optional<Reloc::Model> RM,
-                                   CodeModel::Model CM, CodeGenOpt::Level OL)
+                                   Optional<CodeModel::Model> CM,
+                                   CodeGenOpt::Level OL, bool JIT)
     : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
                         computeFSAdditions(FS, OL, TT), Options,
-                        getEffectiveRelocModel(TT, RM), CM, OL),
+                        getEffectiveRelocModel(TT, RM),
+                        getEffectiveCodeModel(TT, CM, JIT), OL),
       TLOF(createTLOF(getTargetTriple())),
       TargetABI(computeTargetABI(TT, Options)) {
   initAsmInfo();
@@ -365,12 +388,19 @@ bool PPCPassConfig::addInstSelector() {
 }
 
 void PPCPassConfig::addMachineSSAOptimization() {
+  // PPCBranchCoalescingPass needs to be done before machine sinking
+  // since it merges empty blocks.
+  if (EnableBranchCoalescing && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCBranchCoalescingPass());
   TargetPassConfig::addMachineSSAOptimization();
   // For little endian, remove where possible the vector swap instructions
   // introduced at code generation to normalize vector element order.
   if (TM->getTargetTriple().getArch() == Triple::ppc64le &&
       !DisableVSXSwapRemoval)
     addPass(createPPCVSXSwapRemovalPass());
+  // Reduce the number of cr-logical ops.
+  if (ReduceCRLogical && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCReduceCRLogicalsPass());
   // Target-specific peephole cleanups performed after instruction
   // selection.
   if (!DisableMIPeephole) {
@@ -412,6 +442,7 @@ void PPCPassConfig::addPreSched2() {
 }
 
 void PPCPassConfig::addPreEmitPass() {
+  addPass(createPPCPreEmitPeepholePass());
   addPass(createPPCExpandISELPass());
 
   if (getOptLevel() != CodeGenOpt::None)
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index be705507b5347..102bf7ca59c26 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -35,14 +35,15 @@ private:
 public:
   PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                    StringRef FS, const TargetOptions &Options,
-                   Optional<Reloc::Model> RM, CodeModel::Model CM,
-                   CodeGenOpt::Level OL);
+                   Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                   CodeGenOpt::Level OL, bool JIT);
 
   ~PPCTargetMachine() override;
 
   const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
-  // The no argument getSubtargetImpl, while it exists on some targets, is
-  // deprecated and should not be used.
+  // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget,
+  // subtargets are per-function entities based on the target-specific
+  // attributes of each function.
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index be705507b5347..102bf7ca59c26 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -35,14 +35,15 @@ private:
 public:
   PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                    StringRef FS, const TargetOptions &Options,
-                   Optional<Reloc::Model> RM, CodeModel::Model CM,
-                   CodeGenOpt::Level OL);
+                   Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                   CodeGenOpt::Level OL, bool JIT);
 
   ~PPCTargetMachine() override;
 
   const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
-  // The no argument getSubtargetImpl, while it exists on some targets, is
-  // deprecated and should not be used.
+  // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget,
+  // subtargets are per-function entities based on the target-specific
+  // attributes of each function.
   const PPCSubtarget *getSubtargetImpl() const = delete;
 
   // Pass Pipeline Configuration
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.h b/lib/Target/PowerPC/PPCTargetObjectFile.h
index c8b9b2e9790b6..8343a90696d92 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.h
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -10,8 +10,8 @@
 #ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
 #define LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
 
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 6110706b01b90..aa4073f7ea025 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -10,10 +10,10 @@
 #include "PPCTargetTransformInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/CostTable.h"
+#include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/CostTable.h"
-#include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "ppctti"
@@ -189,6 +189,17 @@ int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   return PPCTTIImpl::getIntImmCost(Imm, Ty);
 }
 
+unsigned PPCTTIImpl::getUserCost(const User *U,
+                                 ArrayRef<const Value *> Operands) {
+  if (U->getType()->isVectorTy()) {
+    // Instructions that need to be split should cost more.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
+    return LT.first * BaseT::getUserCost(U, Operands);
+  }
+
+  return BaseT::getUserCost(U, Operands);
+}
+
 void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   if (ST->getDarwinDirective() == PPC::DIR_A2) {
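Note on the getUserCost change above: LT.first, returned by getTypeLegalizationCost, is the number of legal registers the value's type splits into, so the reported user cost scales linearly with the split factor. A minimal sketch of just that arithmetic, with invented names (this is not LLVM API):

    // NumParts plays the role of LT.first; BaseCost is what
    // BaseT::getUserCost(U, Operands) would have returned.
    unsigned scaledVectorUserCost(unsigned BaseCost, unsigned NumParts) {
      // e.g. a <8 x i64> value on 128-bit VSX registers legalizes into
      // four parts, so the reported cost becomes 4 * BaseCost.
      return NumParts * BaseCost;
    }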
@@ -215,9 +226,17 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
   return LoopHasReductions;
 }
 
-bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
-  MaxLoadSize = 8;
-  return true;
+const PPCTTIImpl::TTI::MemCmpExpansionOptions *
+PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
+  static const auto Options = []() {
+    TTI::MemCmpExpansionOptions Options;
+    Options.LoadSizes.push_back(8);
+    Options.LoadSizes.push_back(4);
+    Options.LoadSizes.push_back(2);
+    Options.LoadSizes.push_back(1);
+    return Options;
+  }();
+  return &Options;
 }
 
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 99ca6394d1bed..b42dae4a0254c 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -21,7 +21,7 @@
 #include "PPCTargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
 
 namespace llvm {
 
@@ -51,6 +51,8 @@ public:
   int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                     Type *Ty);
 
+  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+
   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
@@ -61,7 +63,8 @@ public:
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
-  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
+      bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector) const;
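Note on enableMemCmpExpansion above: the old expandMemCmp hook exposed a single MaxLoadSize of 8, while the new interface advertises the ordered load widths 8, 4, 2 and 1, letting the generic expansion cover odd sizes without a libcall. A self-contained sketch of greedily covering a length with those widths (our illustration, not the actual LLVM expansion code):

    #include <cstdio>

    int main() {
      const unsigned LoadSizes[] = {8, 4, 2, 1}; // mirrors Options.LoadSizes above
      unsigned N = 15;                           // bytes to memcmp
      for (unsigned Size : LoadSizes)
        while (N >= Size) {
          std::printf("compare a %u-byte chunk\n", Size);
          N -= Size;
        }
    }

For N = 15 this emits one 8-, 4-, 2- and 1-byte comparison each, which is the kind of load sequence the expansion can now form.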
diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index a57484e5abdf7..f15af790de8f5 100644
--- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -20,7 +20,7 @@
 #include "PPCTargetMachine.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -90,21 +90,21 @@ protected:
       // This pass is run after register coalescing, and so we're looking for
       // a situation like this:
       //   ...
-      //   %vreg5 = COPY %vreg9; VSLRC:%vreg5,%vreg9
-      //   %vreg5 = XSMADDADP %vreg5, %vreg17, %vreg16,
-      //                      %RM; VSLRC:%vreg5,%vreg17,%vreg16
+      //   %5 = COPY %9; VSLRC:%5,%9
+      //   %5 = XSMADDADP %5, %17, %16,
+      //                  implicit %rm; VSLRC:%5,%17,%16
       //   ...
-      //   %vreg9 = XSMADDADP %vreg9, %vreg17, %vreg19,
-      //                      %RM; VSLRC:%vreg9,%vreg17,%vreg19
+      //   %9 = XSMADDADP %9, %17, %19,
+      //                  implicit %rm; VSLRC:%9,%17,%19
       //   ...
       // Where we can eliminate the copy by changing from the A-type to the
       // M-type instruction. Specifically, for this example, this means:
-      //   %vreg5 = XSMADDADP %vreg5, %vreg17, %vreg16,
-      //                      %RM; VSLRC:%vreg5,%vreg17,%vreg16
-      // is replaced by:
-      //   %vreg16 = XSMADDMDP %vreg16, %vreg18, %vreg9,
-      //                       %RM; VSLRC:%vreg16,%vreg18,%vreg9
-      // and we remove: %vreg5 = COPY %vreg9; VSLRC:%vreg5,%vreg9
+      //   %5 = XSMADDADP %5, %17, %16,
+      //                  implicit %rm; VSLRC:%5,%17,%16
+      // is replaced by:
+      //   %16 = XSMADDMDP %16, %18, %9,
+      //                   implicit %rm; VSLRC:%16,%18,%9
+      // and we remove: %5 = COPY %9; VSLRC:%5,%9
 
       SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
@@ -150,13 +150,13 @@
       // walking the MIs we may as well test liveness here.
       //
       // FIXME: There is a case that occurs in practice, like this:
-      //   %vreg9 = COPY %F1; VSSRC:%vreg9
-      //   ...
-      //   %vreg6 = COPY %vreg9; VSSRC:%vreg6,%vreg9
-      //   %vreg7 = COPY %vreg9; VSSRC:%vreg7,%vreg9
-      //   %vreg9 = XSMADDASP %vreg9, %vreg1, %vreg4; VSSRC:
-      //   %vreg6 = XSMADDASP %vreg6, %vreg1, %vreg2; VSSRC:
-      //   %vreg7 = XSMADDASP %vreg7, %vreg1, %vreg3; VSSRC:
+      //   %9 = COPY %f1; VSSRC:%9
+      //   ...
+      //   %6 = COPY %9; VSSRC:%6,%9
+      //   %7 = COPY %9; VSSRC:%7,%9
+      //   %9 = XSMADDASP %9, %1, %4; VSSRC:
+      //   %6 = XSMADDASP %6, %1, %2; VSSRC:
+      //   %7 = XSMADDASP %7, %1, %3; VSSRC:
       // which prevents an otherwise-profitable transformation.
       bool OtherUsers = false, KillsAddendSrc = false;
       for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
@@ -177,11 +177,11 @@
 
       // The transformation doesn't work well with things like:
-      //    %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
-      // unless vreg11 is also a kill, so skip when it is not,
+      //    %5 = A-form-op %5, %11, %5;
+      // unless %11 is also a kill, so skip when it is not,
       // and check operand 3 to see it is also a kill to handle the case:
-      //    %vreg5 = A-form-op %vreg5, %vreg5, %vreg11;
-      // where vreg5 and vreg11 are both kills. This case would be skipped
+      //    %5 = A-form-op %5, %5, %11;
+      // where %5 and %11 are both kills. This case would be skipped
       // otherwise.
 
       unsigned OldFMAReg = MI.getOperand(0).getReg();
@@ -343,7 +343,7 @@ protected:
 
 public:
   bool runOnMachineFunction(MachineFunction &MF) override {
-    if (skipFunction(*MF.getFunction()))
+    if (skipFunction(MF.getFunction()))
       return false;
 
     // If we don't have VSX then go ahead and return without doing
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 7d34efd4af3e0..8a5fb9fdaef11 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -191,7 +191,7 @@ private:
 public:
   // Main entry point for this pass.
   bool runOnMachineFunction(MachineFunction &MF) override {
-    if (skipFunction(*MF.getFunction()))
+    if (skipFunction(MF.getFunction()))
       return false;
 
     // If we don't have VSX on the subtarget, don't do anything.
@@ -353,6 +353,8 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
       break;
     case PPC::LXSDX:
     case PPC::LXSSPX:
+    case PPC::XFLOADf64:
+    case PPC::XFLOADf32:
       // A load of a floating-point value into the high-order half of
       // a vector register is safe, provided that we introduce a swap
       // following the load, which will be done by the SUBREG_TO_REG
@@ -964,7 +966,7 @@ LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {
 
     dbgs() << format("%6d", ID);
     dbgs() << format("%6d", EC->getLeaderValue(ID));
-    dbgs() << format(" BB#%3d", MI->getParent()->getNumber());
+    dbgs() << format(" %bb.%3d", MI->getParent()->getNumber());
     dbgs() << format("  %14s  ", TII->getName(MI->getOpcode()).str().c_str());
 
     if (SwapVector[EntryIdx].IsLoad)
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
index bc09d5f8a7e8e..b4bf635dc2c75 100644
--- a/lib/Target/PowerPC/README.txt
+++ b/lib/Target/PowerPC/README.txt
@@ -256,7 +256,7 @@ _clamp0g:
         cmpwi cr0, r3, 0
         li r2, 0
         blt cr0, LBB1_2
-; BB#1:                                 ; %entry
+; %bb.1:                                ; %entry
         mr r2, r3
 LBB1_2:                                 ; %entry
         mr r3, r2
diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt
index f70ebd82bd5c9..c38e019231611 100644
--- a/lib/Target/PowerPC/README_ALTIVEC.txt
+++ b/lib/Target/PowerPC/README_ALTIVEC.txt
@@ -233,7 +233,7 @@ declare <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8>, <16 x i8>) #1
 
 Produces the following code with -mtriple=powerpc64-unknown-linux-gnu:
 
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
         addis 3, 2, .LCPI0_0@toc@ha
         addis 4, 2, .LCPI0_1@toc@ha
         addi 3, 3, .LCPI0_0@toc@l
diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index a637dd11f8105..979595264472f 100644
--- a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -27,11 +27,11 @@ Target &llvm::getThePPC64LETarget() {
 
 extern "C" void LLVMInitializePowerPCTargetInfo() {
   RegisterTarget<Triple::ppc, /*HasJIT=*/true> X(getThePPC32Target(), "ppc32",
-                                                 "PowerPC 32");
+                                                 "PowerPC 32", "PPC");
   RegisterTarget<Triple::ppc64, /*HasJIT=*/true> Y(getThePPC64Target(), "ppc64",
-                                                   "PowerPC 64");
+                                                   "PowerPC 64", "PPC");
   RegisterTarget<Triple::ppc64le, /*HasJIT=*/true> Z(
-      getThePPC64LETarget(), "ppc64le", "PowerPC 64 LE");
+      getThePPC64LETarget(), "ppc64le", "PowerPC 64 LE", "PPC");
 }
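Note: the extra string argument threaded through RegisterTarget above is a backend name ("PPC" for all three PowerPC entries). A hypothetical out-of-tree target would register the same way; every Foo name below is invented for illustration:

    // Sketch only: mirrors the new four-argument registration used above.
    extern "C" void LLVMInitializeFooTargetInfo() {
      RegisterTarget<Triple::UnknownArch> X(getTheFooTarget(), "foo",
                                            "Foo toy target", "Foo");
    }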
diff --git a/lib/Target/PowerPC/p9-instrs.txt b/lib/Target/PowerPC/p9-instrs.txt
deleted file mode 100644
index a70582aca3989..0000000000000
--- a/lib/Target/PowerPC/p9-instrs.txt
+++ /dev/null
@@ -1,442 +0,0 @@
-Content:
-========
-. Remaining Instructions (Total 56 Instructions, include 2 unknow instructions)
-. Done (Total 155 Instructions: 101 VSX, 54 Altivec)
-
-//------------------------------------------------------------------------------
-//. Remaining Instructions
-//------------------------------------------------------------------------------
-GCC reference: https://sourceware.org/ml/binutils/2015-11/msg00071.html
-
-// Add PC Immediate Shifted DX-form p69
-[PO RT d1 d0 XO d2] addpcis RT,D
- subpcis Rx,value = addpcis Rx,-value
-
-// 6.17.2 Decimal Integer Format Conversion Instructions
-
-// Decimal Convert From National VX-form p352
-[PO VRT EO VRB 1 PS XO] bcdcfn. VRT,VRB,PS
-
-// Decimal Convert From Zoned VX-form p353
-[PO VRT EO VRB 1 PS XO] bcdcfz. VRT,VRB,PS
-
-// Decimal Convert To National VX-form p354
-[PO VRT EO VRB 1 / XO] bcdctn. VRT,VRB
-
-// Decimal Convert To Zoned VX-form p355
-[PO VRT EO VRB 1 PS XO] bcdctz. VRT,VRB,PS
-
-// Decimal Convert From Signed Quadword VX-form p356
-[PO VRT EO VRB 1 PS XO] bcdcfsq. VRT,VRB,PS
-
-// Decimal Convert To Signed Quadword VX-form p356
-[PO VRT EO VRB 1 / XO] bcdctsq. VRT,VRB
-
-// 6.17.3 Decimal Integer Sign Manipulation Instructions
-
-// Decimal Copy Sign VX-form p358
-[PO VRT VRA VRB XO] bcdcpsgn. VRT,VRA,VRB
-
-// Decimal Set Sign VX-form p358
-[PO VRT EO VRB 1 PS XO] bcdsetsgn. VRT,VRB,PS
-
-// Decimal Shift VX-form p359
-[PO VRT VRA VRB 1 PS XO] bcds. VRT,VRA,VRB,PS
-
-// Decimal Unsigned Shift VX-form p360
-[PO VRT VRA VRB 1 / XO] bcdus. VRT,VRA,VRB
-
-// Decimal Shift and Round VX-form p361
-[PO VRT VRA VRB 1 PS XO] bcdsr. VRT,VRA,VRB,PS
-
-// 6.17.5 Decimal Integer Truncate Instructions
-
-// Decimal Truncate VX-form p362
-[PO VRT VRA VRB 1 PS XO] bcdtrunc. VRT,VRA,VRB,PS
-
-// Decimal Unsigned Truncate VX-form p363
-[PO VRT VRA VRB 1 / XO] bcdutrunc. VRT,VRA,VRB
-
-// 3.3.10.1 Character-Type Compare Instructions
-
-// Compare Ranged Byte X-form p87
-[PO BF / L RA RB XO /] cmprb BF,L,RA,RB
-
-// Compare Equal Byte X-form p88
-[PO BF // RA RB XO /] cmpeqb BF,RA,RB
-
-// 3.3.13 Fixed-Point Logical Instructions
-
-// Count Trailing Zeros Word X-form p95
-[PO RS RA /// XO Rc] cnttzw(.) RA,RS
-
-// 3.3.13.1 64-bit Fixed-Point Logical Instructions
-
-// Count Trailing Zeros Doubleword X-form p98
-[PO RS RA /// XO Rc] cnttzd(.) RA,RS
-
-// 4.4 Copy-Paste Facility
-
-// Copy X-form p858
-[PO /// L RA RB XO /] copy RA,RB,L
- copy_first = copy RA, RB, 1
-// CP_Abort p860
-[PO /// /// /// XO /] cp_abort
-
-// Paste p859
-[PO /// L RA RB XO Rc] paste(.) RA,RB,L
- paste_last = paste RA,RB,1
-
-// 3.3.9 Fixed-Point Arithmetic Instructions
-
-// Deliver A Random Number X-form p79
-[PO RT /// L /// XO /] darn RT,L
-
-// Multiply-Add High Doubleword VA-form p81
-[PO RT RA RB RC XO] maddhd RT,RA.RB,RC
-
-// Multiply-Add High Doubleword Unsigned VA-form p81
-[PO RT RA RB RC XO] maddhdu RT,RA.RB,RC
-
-// Multiply-Add Low Doubleword VA-form p81
-[PO RT RA RB RC XO] maddld RT,RA.RB,RC
-
-// Modulo Signed Word X-form p76
-[PO RT RA RB XO /] modsw RT,RA,RB
-
-// Modulo Unsigned Word X-form p76
-[PO RT RA RB XO /] moduw RT,RA,RB
-
-// Modulo Signed Doubleword X-form p84
-[PO RT RA RB XO /] modsd RT,RA,RB
-
-// Modulo Unsigned Doubleword X-form p84
-[PO RT RA RB XO /] modud RT,RA,RB
-
-
-// DFP Test Significance Immediate [Quad] X-form p204
-[PO BF / UIM FRB XO /] dtstsfi BF,UIM,FRB
-[PO BF / UIM FRBp XO /] dtstsfiq BF,UIM,FRBp
-
-// 3.3.14.2.1 64-bit Fixed-Point Shift Instructions
-
-// Extend-Sign Word and Shift Left Immediate XS-form p109
-[PO RS RA sh XO sh Rc] extswsli(.) RA,RS,SH
-
-// 4.5.1 Load Atomic
-
-// Load Word Atomic X-form p864
-[PO RT RA FC XO /] lwat RT,RA,FC
-
-// Load Doubleword Atomic X-form p864
-[PO RT RA FC XO /] ldat RT,RA,FC
-
-// 4.5.2 Store Atomic
-
-// Store Word Atomic X-form p866
-[PO RS RA FC XO /] stwat RS,RA,FC
-
-// Store Doubleword Atomic X-form p866
-[PO RS RA FC XO /] stdat RS,RA,FC
-
-// 3.3.2.1 64-bit Fixed-Point Load Instructions
-
-// Load Doubleword Monitored Indexed X-form p54
-[PO RT RA RB XO /] ldmx RT,RA,RB
-
-// 3.3.16 Move To/From Vector-Scalar Register Instructions
-
-// Move From VSR Lower Doubleword XX1-form p111
-[PO S RA /// XO SX] mfvsrld RA,XS
-
-// Move To VSR Double Doubleword XX1-form p114
-[PO T RA RB XO TX] mtvsrdd XT,RA,RB
-
-// Move To VSR Word & Splat XX1-form p115
-[PO T RA /// XO TX] mtvsrws XT,RA
-
-// Move to CR from XER Extended X-form p119
-[PO BF // /// /// XO /] mcrxrx BF
-
-// Set Boolean X-form p121
-[PO RT BFA // /// XO /] setb RT,BFA
-
-// Message Synchronize X-form p1126
-[PO /// /// /// XO /] msgsync
-
-// SLB Invalidate Entry Global X-form p1026
-[PO RS /// RB XO /] slbieg RS,RB
-
-// SLB Synchronize X-form p1031
-[PO /// /// /// XO /] slbsync
-
-// 3.3.2.1 Power-Saving Mode Instruction
-
-// stop XL-form p957
-[PO /// /// /// XO /] stop
-
-// 4.6.4 Wait Instruction
-// Wait X-form p880
-[PO /// WC /// /// XO /] wait
-
-// Unknow Instructions:
-urfid -- gcc's implementation:
-  {"urfid", XL(19,306), 0xffffffff, POWER9, PPCNONE, {0}},
-  (4c 00 02 64|64 02 00 4c) urfid
-
-rmieg -- gcc's implementation:
-  {"rmieg", X(31,882), XRTRA_MASK, POWER9, PPCNONE, {RB}},
-  (7c 00 f6 e4|e4 f6 00 7c) rmieg r30
-
-//------------------------------------------------------------------------------
-//. Done:
-//------------------------------------------------------------------------------
-
-//======================================
-"vsx instructions"
-
-//--------------------------------------
-"7.6.1.2.1 VSX Scalar Move Instructions"
-// VSX Scalar Quad-Precision Move Instructions
-
-// VSX Scalar Copy Sign Quad-Precision X-form p.553
-[PO VRT VRA VRB XO /] xscpsgnqp
-
-// VSX Scalar Absolute Quad-Precision X-form 531
-// VSX Scalar Negate Quad-Precision X-form 627
-// VSX Scalar Negative Absolute Quad-Precision X-form 626
-[PO VRT XO VRB XO /] xsabsqp xsnegqp xsnabsqp
-
-//--------------------------------------
-"7.6.1.3 VSX Floating-Point Arithmetic Instructions"
-
-// VSX Scalar Quad-Precision Elementary Arithmetic
-
-// VSX Scalar Add Quad-Precision [using round to Odd] X-form 539
-// VSX Scalar Divide Quad-Precision [using round to Odd] X-form 584
-// VSX Scalar Multiply Quad-Precision [using round to Odd] X-form 622
-[PO VRT VRA VRB XO RO] xsaddqp xsaddqpo xsdivqp xsdivqpo xsmulqp xsmulqpo
-
-// VSX Scalar Square Root Quad-Precision [using round to Odd] X-form 662
-// VSX Scalar Subtract Quad-Precision [using round to Odd] X-form 667
-  xssubqp xssubqpo
-
-[PO VRT XO VRB XO RO] xssqrtqp xssqrtqpo
-
-// VSX Scalar Quad-Precision Multiply-Add Arithmetic Instructions
-
-// VSX Scalar Multiply-Add Quad-Precision [using round to Odd] X-form 596
-// VSX Scalar Multiply-Subtract Quad-Precision [using round to Odd] X-form 617
-// VSX Scalar Negative Multiply-Add Quad-Precision [using round to Odd] X-form 636
-// VSX Scalar Negative Multiply-Subtract Quad-Precision [using round to Odd]
-// X-form 645
-[PO VRT VRA VRB XO RO] xsmaddqp xsmaddqpo xsmsubqp xsmsubqpo
-                       xsnmaddqp xsnmaddqpo xsnmsubqp xsnmsubqpo
-
-22
-//--------------------------------------
-"7.6.1.4 VSX Floating-Point Compare Instructions"
-
-// VSX Scalar Quad-Precision Compare Instructions
-
-// VSX Scalar Compare Ordered Quad-Precision X-form 549
-// VSX Scalar Compare Unordered Quad-Precision X-form 552
-[PO BF // VRA VRB XO /] xscmpoqp xscmpuqp
-
-"7.6.1.8 VSX Scalar Floating-Point Support Instructions"
-// VSX Scalar Compare Exponents Quad-Precision X-form p. 541 542
-[PO BF // A B XO AX BX /] xscmpexpdp
-[PO BF // VRA VRB XO /] xscmpexpqp
-
-// VSX Scalar Compare DP, XX3-form, p.543 544 545
-// VSX Scalar Compare Equal Double-Precision,
-[PO T A B XO AX BX TX] xscmpeqdp xscmpgedp xscmpgtdp xscmpnedp
-
-// VSX Vector Compare Not Equal Double-Precision XX3-form 691
-[PO T A B Rc XO AX BX TX] xvcmpnedp xvcmpnedp. xvcmpnesp xvcmpnesp.
-
-//--------------------------------------
-"7.6.1.5 VSX FP-FP Conversion Instructions"
-// VSX Scalar Quad-Precision Floating-Point Conversion Instructions
-
-// VSX Scalar round & Convert Quad-Precision format to Double-Precision format
-// [using round to Odd] X-form 567
-[PO VRT XO VRB XO /] xscvqpdp xscvqpdpo (actually [PO VRT XO VRB XO RO])
-[PO VRT XO VRB XO /] xscvdpqp
-
-// VSX Scalar Quad-Precision Convert to Integer Instructions
-
-// VSX Scalar truncate & Convert Quad-Precision format to Signed Doubleword format
-// 568 570 572 574
-[PO VRT XO VRB XO /] xscvqpsdz xscvqpswz xscvqpudz xscvqpuwz
-576 = 580 xscvsdqp xscvudqp
-
-"7.6.1.7 VSX Round to Floating-Point Integer Instructions"
-// VSX Scalar round & Convert Double-Precision format to Half-Precision format
-// XX2-form 554 566
-[PO T XO B XO BX TX] xscvdphp xscvhpdp
-
-// VSX Vector Convert Half-Precision format to Single-Precision format
-// XX2-form 703 705
-[PO T XO B XO BX TX] xvcvhpsp xvcvsphp
-
-// VSX Scalar Round to Quad-Precision Integer [with Inexact] Z23-form 654
-[PO VRT /// R VRB RMC XO EX] xsrqpi xsrqpix
-
-// VSX Scalar Round Quad-Precision to Double-Extended Precision Z23-form 656
-[PO VRT /// R VRB RMC XO /] xsrqpxp
-def XSRQPXP : Z23Form_1<63, 37,
-                        (outs vrrc:$vT), (ins u5imm:$R, vrrc:$vB, u2imm:$RMC),
-                        "xsrqpxp $vT, $R, $vB, $RMC"), IIC_VecFP, []>;
-
-27~28
-//--------------------------------------
-// VSX Scalar Insert Exponent Double-Precision X-form 588
-// VSX Scalar Insert Exponent Quad-Precision X-form 589
-[PO VT rA rB XO /] xsiexpdp
-[PO VRT VRA VRB XO /] xsiexpqp
-
-// VSX Vector Insert Exponent Double-Precision XX3-form 722
-[PO T A B XO AX BX TX] xviexpdp xviexpsp
-
-// VSX Vector Extract Unsigned Word XX2-form 788
-// VSX Vector Insert Word XX2-form
-[PO T / UIM B XO BX TX] xxextractuw xxinsertw
-
-// VSX Scalar Extract Exponent Double-Precision XX2-form 676
-[PO BF DCMX B XO BX /]
-[PO T XO B XO BX /] xsxexpdp xsxsigdp
-// X-form
-[PO VRT XO VRB XO /] xsxexpqp xsxsigqp
-
-// VSX Vector Extract Exponent Double-Precision XX2-form 784
-[PO T XO B XO BX TX] xvxexpdp xvxexpsp
-
-// VSX Vector Extract Significand Double-Precision XX2-form 785
-[PO T XO B XO BX TX] xvxsigdp xvxsigsp
-
-//--------------------------------------
-// VSX Scalar Test Data Class Double-Precision XX2-form p673
-// VSX Scalar Test Data Class Quad-Precision X-form 674
-// VSX Scalar Test Data Class Single-Precision XX2-form 675
-[PO BF DCMX B XO BX /] xststdcdp xststdcsp
-[PO BF DCMX VRB XO /] xststdcqp
-
-// VSX Vector Test Data Class Double-Precision XX2-form 782 783
-[PO T dx B XO dc XO dm BX TX] xvtstdcdp xvtstdcsp
-
-//--------------------------------------
-// VSX Scalar Maximum Type-C Double-Precision XX3-form 601 ~ 609
-[PO T A B XO AX BX TX] xsmaxcdp xsmaxjdp xsmincdp xsminjdp
-
-//--------------------------------------
-// VSX Vector Byte-Reverse Doubleword XX2-form 786 787
-[PO T XO B XO BX TX] xxbrd xxbrh xxbrq xxbrw
-
-// VSX Vector Permute XX3-form 794
-[PO T A B XO AX BX TX] xxperm xxpermr
-
-// VSX Vector Splat Immediate Byte 796 x-form
-[PO T EO IMM8 XO TX] xxspltib <= sign or unsigned?
-
-30
-//--------------------------------------
-// Load VSX Vector DQ-form 511
-[PO T RA DQ TX XO] lxv
-
-// Store VSX Vector DQ-form 526
-[PO S RA DQ SX XO] stxv
-
-// Load VSX Scalar Doubleword DS-form 499
-// Load VSX Scalar Single DS-form 504
-[PO VRT RA DS XO] lxsd lxssp
-
-// Store VSX Scalar Doubleword DS-form 517
-// Store VSX Scalar Single DS-form 520
-[PO VRT RA DS XO] stxsd stxssp
-
-
-// Load VSX Vector Indexed X-form 511
-// Load VSX Scalar as Integer Byte & Zero Indexed X-form 501
-// Load VSX Vector Byte*16 Indexed X-form 506
-// Load VSX Vector with Length X-form 508
-// Load VSX Vector Left-justified with Length X-form 510
-// Load VSX Vector Halfword*8 Indexed X-form 514
-// Load VSX Vector Word & Splat Indexed X-form 516
-[PO T RA RB XO TX] lxvx lxsibzx lxsihzx lxvb16x lxvl lxvll lxvh8x lxvwsx
-
-// Store VSX Scalar as Integer Byte Indexed X-form 518
-// Store VSX Scalar as Integer Halfword Indexed X-form 518
-// Store VSX Vector Byte*16 Indexed X-form 522
-// Store VSX Vector Halfword*8 Indexed X-form 524
-// Store VSX Vector with Length X-form 526
-// Store VSX Vector Left-justified with Length X-form 528
-// Store VSX Vector Indexed X-form 529
-[PO S RA RB XO SX] stxsibx stxsihx stxvb16x stxvh8x stxvl stxvll stxvx
-
-21
-
-//--------------------------------------
-". vector instructions"
-
-[1] PowerISA-v3.0 p.933 - Table 1, and Chapter 6. Vector Facility (altivec)
-[2] https://sourceware.org/ml/binutils/2015-11/msg00071.html
-
-//--------------------------------------
-New patch:
-// vector bit, p.367, 6.16 Vector Bit Permute Instruction
-[PO VRT VRA VRB XO] vbpermd, (existing: vbpermq)
-
-// vector permute, p.280
-[PO VRT VRA VRB VRC XO] vpermr
-
-// vector rotate left, p.341
-[PO VRT VRA VRB XO] vrlwnm vrlwmi vrldnm vrldmi
-
-// vector shift, p.285
-[PO VRT VRA VRB XO] vslv vsrv
-
-// vector multiply-by-10, p.375
-[PO VRT VRA /// XO] vmul10cuq vmul10uq
-[PO VRT VRA VRB XO] vmul10ecuq vmul10euq
-
-12
-//--------------------------------------
-http://reviews.llvm.org/D15887 + ext + neg + prty
- vbpermd
-// vector count leading/trailing zero
-. new vx-form: p.31, 1.6.14 VX-FORM
-[PO RT EO VRB XO] vclzlsbb vctzlsbb (p.363)
-
-// Vector Count Trailing Zeros Instructions, 362
-[PO VRT EO VRB XO] vctzb vctzh vctzw vctzd (v16i8 v8i16 v4i32 v2i64)
-
-// vector extend sign (p.314)
-[PO VRT EO VRB XO] vextsb2w vextsh2w vextsb2d vextsh2d vextsw2d
-
-// vector negate, p.313
-[PO VRT EO VRB XO] vnegd vnegw
-
-// vector parity, p.335
-[PO VRT EO VRB XO] vprtybd vprtybq vprtybw
-
-16
-//--------------------------------------
-// vector compare, p.330
-[PO VRT VRA VRB RC XO] vcmpneb vcmpneb. vcmpneh vcmpneh. vcmpnew vcmpnew.
-                       vcmpnezb vcmpnezb. vcmpnezh vcmpnezh. vcmpnezw vcmpnezw.
-12
-//--------------------------------------
-http://reviews.llvm.org/D15917 + insert
-// vector extract (p.287) ref: vspltb (v2.07, p.227)
-// vector insert, p.288
-[PO VRT / UIM VRB XO] vinsertb vinsertd vinserth vinsertw
-
-// Vector Extract Unsigned
-[PO VRT / UIM VRB XO] vextractub vextractuh vextractuw vextractd
-
-// p.364: Vector Extract Unsigned Left/Right-Indexed
-[PO RT RA VRB XO] vextublx vextubrx vextuhlx vextuhrx vextuwlx vextuwrx
-
-14
--
cgit v1.3