Diffstat (limited to 'lib/Target/PowerPC')
-rw-r--r--  lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 29
-rw-r--r--  lib/Target/PowerPC/CMakeLists.txt | 3
-rw-r--r--  lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 86
-rw-r--r--  lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h | 19
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 6
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 12
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 1
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 14
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 15
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp | 8
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h | 16
-rw-r--r--  lib/Target/PowerPC/P9InstrResources.td | 687
-rw-r--r--  lib/Target/PowerPC/PPC.h | 10
-rw-r--r--  lib/Target/PowerPC/PPCAsmPrinter.cpp | 170
-rw-r--r--  lib/Target/PowerPC/PPCBranchCoalescing.cpp | 784
-rw-r--r--  lib/Target/PowerPC/PPCBranchSelector.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCCTRLoops.cpp | 56
-rw-r--r--  lib/Target/PowerPC/PPCEarlyReturn.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCExpandISEL.cpp | 83
-rw-r--r--  lib/Target/PowerPC/PPCFastISel.cpp | 10
-rw-r--r--  lib/Target/PowerPC/PPCFrameLowering.cpp | 18
-rw-r--r--  lib/Target/PowerPC/PPCFrameLowering.h | 4
-rw-r--r--  lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 2035
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 537
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.h | 73
-rw-r--r--  lib/Target/PowerPC/PPCInstr64Bit.td | 27
-rw-r--r--  lib/Target/PowerPC/PPCInstrAltivec.td | 38
-rw-r--r--  lib/Target/PowerPC/PPCInstrFormats.td | 107
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.cpp | 1415
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.h | 74
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.td | 219
-rw-r--r--  lib/Target/PowerPC/PPCInstrVSX.td | 362
-rw-r--r--  lib/Target/PowerPC/PPCLoopPreIncPrep.cpp | 65
-rw-r--r--  lib/Target/PowerPC/PPCMCInstLower.cpp | 85
-rw-r--r--  lib/Target/PowerPC/PPCMIPeephole.cpp | 966
-rw-r--r--  lib/Target/PowerPC/PPCMachineBasicBlockUtils.h | 198
-rw-r--r--  lib/Target/PowerPC/PPCMachineFunctionInfo.cpp | 14
-rw-r--r--  lib/Target/PowerPC/PPCMachineFunctionInfo.h | 18
-rw-r--r--  lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 95
-rw-r--r--  lib/Target/PowerPC/PPCQPXLoadSplat.cpp | 8
-rw-r--r--  lib/Target/PowerPC/PPCReduceCRLogicals.cpp | 535
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.cpp | 47
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.td | 5
-rw-r--r--  lib/Target/PowerPC/PPCScheduleP9.td | 108
-rw-r--r--  lib/Target/PowerPC/PPCSubtarget.h | 2
-rw-r--r--  lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.cpp | 37
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.h | 9
-rw-r--r--  lib/Target/PowerPC/PPCTargetObjectFile.h | 2
-rw-r--r--  lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 29
-rw-r--r--  lib/Target/PowerPC/PPCTargetTransformInfo.h | 7
-rw-r--r--  lib/Target/PowerPC/PPCVSXFMAMutate.cpp | 44
-rw-r--r--  lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 6
-rw-r--r--  lib/Target/PowerPC/README.txt | 2
-rw-r--r--  lib/Target/PowerPC/README_ALTIVEC.txt | 2
-rw-r--r--  lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp | 6
-rw-r--r--  lib/Target/PowerPC/p9-instrs.txt | 442
57 files changed, 7753 insertions, 1903 deletions
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 52432a5820fbe..d6db354e02152 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -251,7 +251,6 @@ namespace {
struct PPCOperand;
class PPCAsmParser : public MCTargetAsmParser {
- const MCInstrInfo &MII;
bool IsPPC64;
bool IsDarwin;
@@ -298,7 +297,7 @@ class PPCAsmParser : public MCTargetAsmParser {
public:
PPCAsmParser(const MCSubtargetInfo &STI, MCAsmParser &,
const MCInstrInfo &MII, const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, STI), MII(MII) {
+ : MCTargetAsmParser(Options, STI, MII) {
// Check for 64-bit vs. 32-bit pointer mode.
const Triple &TheTriple = STI.getTargetTriple();
IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
@@ -394,6 +393,10 @@ public:
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const override { return EndLoc; }
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
+ SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+
/// isPPC64 - True if this operand is for an instruction in 64-bit mode.
bool isPPC64() const { return IsPPC64; }
@@ -1138,6 +1141,15 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst,
Inst = TmpInst;
break;
}
+ case PPC::SUBPCIS: {
+ MCInst TmpInst;
+ int64_t N = Inst.getOperand(1).getImm();
+ TmpInst.setOpcode(PPC::ADDPCIS);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(MCOperand::createImm(-N));
+ Inst = TmpInst;
+ break;
+ }
case PPC::SRDI:
case PPC::SRDIo: {
MCInst TmpInst;
@@ -1260,6 +1272,9 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst,
}
}
+static std::string PPCMnemonicSpellCheck(StringRef S, uint64_t FBS,
+ unsigned VariantID = 0);
+
bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out, uint64_t &ErrorInfo,
@@ -1275,8 +1290,13 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return false;
case Match_MissingFeature:
return Error(IDLoc, "instruction use requires an option to be enabled");
- case Match_MnemonicFail:
- return Error(IDLoc, "unrecognized instruction mnemonic");
+ case Match_MnemonicFail: {
+ uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ std::string Suggestion = PPCMnemonicSpellCheck(
+ ((PPCOperand &)*Operands[0]).getToken(), FBS);
+ return Error(IDLoc, "invalid instruction" + Suggestion,
+ ((PPCOperand &)*Operands[0]).getLocRange());
+ }
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0ULL) {
@@ -1912,6 +1932,7 @@ extern "C" void LLVMInitializePowerPCAsmParser() {
#define GET_REGISTER_MATCHER
#define GET_MATCHER_IMPLEMENTATION
+#define GET_MNEMONIC_SPELL_CHECKER
#include "PPCGenAsmMatcher.inc"
// Define this matcher function after the auto-generated include so we
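Taken together, the parser changes above have two user-visible effects (the inputs below are invented for illustration): the new SUBPCIS mnemonic is accepted and rewritten into ADDPCIS with a negated immediate, so "subpcis 3, 16" is emitted as "addpcis 3, -16"; and an unknown mnemonic is now diagnosed as "invalid instruction" with a spelling suggestion computed from the mnemonics available under the current feature bits, replacing the old generic "unrecognized instruction mnemonic" error.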
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index 7ca4c1999003a..3f173787114d4 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_target(PowerPCCodeGen
PPCBoolRetToInt.cpp
PPCAsmPrinter.cpp
PPCBranchSelector.cpp
+ PPCBranchCoalescing.cpp
PPCCCState.cpp
PPCCTRLoops.cpp
PPCHazardRecognizers.cpp
@@ -38,9 +39,11 @@ add_llvm_target(PowerPCCodeGen
PPCTOCRegDeps.cpp
PPCTLSDynamicCall.cpp
PPCVSXCopy.cpp
+ PPCReduceCRLogicals.cpp
PPCVSXFMAMutate.cpp
PPCVSXSwapRemoval.cpp
PPCExpandISEL.cpp
+ PPCPreEmitPeephole.cpp
)
add_subdirectory(AsmParser)
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index baf5902ddf584..ea709a73ebf26 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -23,7 +24,6 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOpcodes.h"
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -39,6 +39,12 @@ static cl::opt<bool>
ShowVSRNumsAsVR("ppc-vsr-nums-as-vr", cl::Hidden, cl::init(false),
cl::desc("Prints full register names with vs{31-63} as v{0-31}"));
+// Prints full register names with percent symbol.
+static cl::opt<bool>
+FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden,
+ cl::init(false),
+ cl::desc("Prints full register names with percent"));
+
#define PRINT_ALIAS_INSTR
#include "PPCGenAsmWriter.inc"
@@ -84,7 +90,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
}
-
+
if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) &&
MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
O << "\tmr ";
@@ -94,7 +100,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
printAnnotation(O, Annot);
return;
}
-
+
if (MI->getOpcode() == PPC::RLDICR ||
MI->getOpcode() == PPC::RLDICR_32) {
unsigned char SH = MI->getOperand(2).getImm();
@@ -161,7 +167,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
}
-
+
if (!printAliasInstr(MI, O))
printInstruction(MI, O);
printAnnotation(O, Annot);
@@ -259,7 +265,7 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
}
llvm_unreachable("Invalid predicate code");
}
-
+
assert(StringRef(Modifier) == "reg" &&
"Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!");
printOperand(MI, OpNo+1, O);
@@ -445,13 +451,57 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
O << '@' << MCSymbolRefExpr::getVariantKindName(refExp.getKind());
}
+/// showRegistersWithPercentPrefix - Check if this register name should be
+/// printed with a percentage symbol as prefix.
+bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const {
+ if (!FullRegNamesWithPercent || TT.isOSDarwin() || TT.getOS() == Triple::AIX)
+ return false;
+
+ switch (RegName[0]) {
+ default:
+ return false;
+ case 'r':
+ case 'f':
+ case 'q':
+ case 'v':
+ case 'c':
+ return true;
+ }
+}
+
+/// getVerboseConditionRegName - This method expands the condition register
+/// when requested explicitly or targeting Darwin.
+const char *PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum,
+ unsigned RegEncoding)
+ const {
+ if (!TT.isOSDarwin() && !FullRegNames)
+ return nullptr;
+ if (RegNum < PPC::CR0EQ || RegNum > PPC::CR7UN)
+ return nullptr;
+ const char *CRBits[] = {
+ "lt", "gt", "eq", "un",
+ "4*cr1+lt", "4*cr1+gt", "4*cr1+eq", "4*cr1+un",
+ "4*cr2+lt", "4*cr2+gt", "4*cr2+eq", "4*cr2+un",
+ "4*cr3+lt", "4*cr3+gt", "4*cr3+eq", "4*cr3+un",
+ "4*cr4+lt", "4*cr4+gt", "4*cr4+eq", "4*cr4+un",
+ "4*cr5+lt", "4*cr5+gt", "4*cr5+eq", "4*cr5+un",
+ "4*cr6+lt", "4*cr6+gt", "4*cr6+eq", "4*cr6+un",
+ "4*cr7+lt", "4*cr7+gt", "4*cr7+eq", "4*cr7+un"
+ };
+ return CRBits[RegEncoding];
+}
+
+// showRegistersWithPrefix - This method determines whether registers
+// should be number-only or include the prefix.
+bool PPCInstPrinter::showRegistersWithPrefix() const {
+ if (TT.getOS() == Triple::AIX)
+ return false;
+ return TT.isOSDarwin() || FullRegNamesWithPercent || FullRegNames;
+}
/// stripRegisterPrefix - This method strips the character prefix from a
-/// register name so that only the number is left. Used by for linux asm.
+/// register name so that only the number is left.
static const char *stripRegisterPrefix(const char *RegName) {
- if (FullRegNames || ShowVSRNumsAsVR)
- return RegName;
-
switch (RegName[0]) {
case 'r':
case 'f':
@@ -462,7 +512,7 @@ static const char *stripRegisterPrefix(const char *RegName) {
return RegName + 1;
case 'c': if (RegName[1] == 'r') return RegName + 2;
}
-
+
return RegName;
}
@@ -487,20 +537,24 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
Reg = PPC::VSX32 + (Reg - PPC::VF0);
}
- const char *RegName = getRegisterName(Reg);
- // The linux and AIX assembler does not take register prefixes.
- if (!isDarwinSyntax())
+ const char *RegName;
+ RegName = getVerboseConditionRegName(Reg, MRI.getEncodingValue(Reg));
+ if (RegName == nullptr)
+ RegName = getRegisterName(Reg);
+ if (showRegistersWithPercentPrefix(RegName))
+ O << "%";
+ if (!showRegistersWithPrefix())
RegName = stripRegisterPrefix(RegName);
-
+
O << RegName;
return;
}
-
+
if (Op.isImm()) {
O << Op.getImm();
return;
}
-
+
assert(Op.isExpr() && "unknown operand kind in printOperand");
Op.getExpr()->print(O, &MAI);
}
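In practice the printer changes above allow three styles for a register operand (illustrative, not part of the patch): with the defaults on Linux and AIX a GPR still prints as a bare number such as 3; with the pre-existing full-register-names option (or on Darwin) it prints as r3; and the new ppc-reg-with-percent-prefix option prints it as %r3. When full names are in effect, individual condition-register bits additionally expand to verbose forms such as 4*cr1+eq via getVerboseConditionRegName.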
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 9c79ffb1176c0..f000fbb98110d 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -14,21 +14,24 @@
#ifndef LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
#define LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
class PPCInstPrinter : public MCInstPrinter {
- bool IsDarwin;
+ Triple TT;
+private:
+ bool showRegistersWithPercentPrefix(const char *RegName) const;
+ bool showRegistersWithPrefix() const;
+ const char *getVerboseConditionRegName(unsigned RegNum,
+ unsigned RegEncoding) const;
+
public:
PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
- const MCRegisterInfo &MRI, bool isDarwin)
- : MCInstPrinter(MAI, MII, MRI), IsDarwin(isDarwin) {}
-
- bool isDarwinSyntax() const {
- return IsDarwin;
- }
-
+ const MCRegisterInfo &MRI, Triple T)
+ : MCInstPrinter(MAI, MII, MRI), TT(T) {}
+
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
const MCSubtargetInfo &STI) override;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index bdad2fe8714fd..2a1de244da923 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -204,7 +204,8 @@ namespace {
public:
DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { }
- MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override {
bool is64 = getPointerSize() == 8;
return createPPCMachObjectWriter(
OS,
@@ -220,7 +221,8 @@ namespace {
ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI) :
PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { }
- MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override {
bool is64 = getPointerSize() == 8;
return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI);
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 1488bd5b0be61..44ee9733b16e1 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
@@ -416,10 +417,9 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
}
}
-MCObjectWriter *llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- bool IsLittleEndian,
- uint8_t OSABI) {
- MCELFObjectTargetWriter *MOTW = new PPCELFObjectWriter(Is64Bit, OSABI);
- return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+std::unique_ptr<MCObjectWriter>
+llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+ bool IsLittleEndian, uint8_t OSABI) {
+ auto MOTW = llvm::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI);
+ return createELFObjectWriter(std::move(MOTW), OS, IsLittleEndian);
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index d30bf1a56e8aa..8ac461b96b88c 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -24,6 +24,7 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) {
}
IsLittleEndian = false;
+ SeparatorString = "@";
CommentString = ";";
ExceptionsType = ExceptionHandling::DwarfCFI;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index e8f220ea54576..a1e4e07b25af4 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -94,15 +94,6 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
- CodeModel::Model &CM) {
- if (CM == CodeModel::Default) {
- if (!TT.isOSDarwin() &&
- (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le))
- CM = CodeModel::Medium;
- }
-}
-
namespace {
class PPCTargetAsmStreamer : public PPCTargetStreamer {
@@ -248,7 +239,7 @@ static MCInstPrinter *createPPCMCInstPrinter(const Triple &T,
const MCAsmInfo &MAI,
const MCInstrInfo &MII,
const MCRegisterInfo &MRI) {
- return new PPCInstPrinter(MAI, MII, MRI, T.isOSDarwin());
+ return new PPCInstPrinter(MAI, MII, MRI, T);
}
extern "C" void LLVMInitializePowerPCTargetMC() {
@@ -257,9 +248,6 @@ extern "C" void LLVMInitializePowerPCTargetMC() {
// Register the MC asm info.
RegisterMCAsmInfoFn C(*T, createPPCMCAsmInfo);
- // Register the MC codegen info.
- TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
-
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createPPCMCInstrInfo);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 893233ee2300f..80a74c09a598a 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -19,6 +19,7 @@
#include "llvm/Support/MathExtras.h"
#include <cstdint>
+#include <memory>
namespace llvm {
@@ -47,12 +48,15 @@ MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
/// Construct an PPC ELF object writer.
-MCObjectWriter *createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- bool IsLittleEndian, uint8_t OSABI);
+std::unique_ptr<MCObjectWriter> createPPCELFObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit,
+ bool IsLittleEndian,
+ uint8_t OSABI);
/// Construct a PPC Mach-O object writer.
-MCObjectWriter *createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- uint32_t CPUType,
- uint32_t CPUSubtype);
+std::unique_ptr<MCObjectWriter> createPPCMachObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
/// Returns true iff Val consists of one contiguous run of 1s with any number of
/// 0s on either side. The 1s are allowed to wrap from LSB to MSB, so
@@ -97,6 +101,7 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
// Defines symbolic names for the PowerPC instructions.
//
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_SCHED_ENUM
#include "PPCGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index d5506277ca880..4b9055ec70419 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -374,10 +374,10 @@ void PPCMachObjectWriter::RecordPPCRelocation(
Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
-MCObjectWriter *llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit, uint32_t CPUType,
- uint32_t CPUSubtype) {
+std::unique_ptr<MCObjectWriter>
+llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+ uint32_t CPUType, uint32_t CPUSubtype) {
return createMachObjectWriter(
- new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS,
+ llvm::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS,
/*IsLittleEndian=*/false);
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index acea600fbb0da..603ac960133f9 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -70,6 +70,22 @@ namespace PPC {
/// Assume the condition register is set by MI(a,b), return the predicate if
/// we modify the instructions such that condition register is set by MI(b,a).
Predicate getSwappedPredicate(Predicate Opcode);
+
+ /// Return the condition without hint bits.
+ inline unsigned getPredicateCondition(Predicate Opcode) {
+ return (unsigned)(Opcode & ~BR_HINT_MASK);
+ }
+
+ /// Return the hint bits of the predicate.
+ inline unsigned getPredicateHint(Predicate Opcode) {
+ return (unsigned)(Opcode & BR_HINT_MASK);
+ }
+
+ /// Return predicate consisting of specified condition and hint bits.
+ inline Predicate getPredicate(unsigned Condition, unsigned Hint) {
+ return (Predicate)((Condition & ~BR_HINT_MASK) |
+ (Hint & BR_HINT_MASK));
+ }
}
}
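A minimal sketch of how the new predicate helpers compose, assuming only the PPC::Predicate type and the BR_HINT_MASK enumerator referenced above (illustrative only, not part of the patch):

    #include "MCTargetDesc/PPCPredicates.h"
    #include <cassert>
    using namespace llvm;

    // Splitting a predicate into its condition and hint bits and recombining
    // them is lossless, which lets callers rewrite the branch hint without
    // touching the compare condition (and vice versa).
    static void roundTripPredicate(PPC::Predicate Pred) {
      unsigned Cond = PPC::getPredicateCondition(Pred); // Pred & ~BR_HINT_MASK
      unsigned Hint = PPC::getPredicateHint(Pred);      // Pred &  BR_HINT_MASK
      assert(PPC::getPredicate(Cond, Hint) == Pred && "round trip must be exact");
      (void)Cond;
      (void)Hint;
    }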
diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td
index aea022f887667..dc6ed16e53ce7 100644
--- a/lib/Target/PowerPC/P9InstrResources.td
+++ b/lib/Target/PowerPC/P9InstrResources.td
@@ -12,11 +12,29 @@
// is listed here. Instructions in this file belong to itinerary classes that
// have instructions with different resource requirements.
//
+// The makeup of the P9 CPU is modeled as follows:
+// - Each CPU is made up of two superslices.
+// - Each superslice is made up of two slices. Therefore, there are 4 slices
+// for each CPU.
+// - Up to 6 instructions can be dispatched to each CPU. Three per superslice.
+// - Each CPU has:
+// - One CY (Crypto) unit P9_CY_*
+// - One DFU (Decimal Floating Point and Quad Precision) unit P9_DFU_*
+// - Two PM (Permute) units. One on each superslice. P9_PM_*
+// - Two DIV (Fixed Point Divide) units. One on each superslice. P9_DIV_*
+// - Four ALU (Fixed Point Arithmetic) units. One on each slice. P9_ALU_*
+// - Four DP (Floating Point) units. One on each slice. P9_DP_*
+// This also includes fixed point multiply add.
+// - Four AGEN (Address Generation) units. One for each slice. P9_AGEN_*
+// - Four Load/Store Queues. P9_LS_*
+// - Each set of instructions will require a number of these resources.
//===----------------------------------------------------------------------===//
-
+// Two cycle ALU vector operation that uses an entire superslice.
+// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
VADDCUW,
VADDUBM,
@@ -26,47 +44,41 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
VAND,
VANDC,
VCMPEQUB,
- VCMPEQUBo,
VCMPEQUD,
- VCMPEQUDo,
VCMPEQUH,
- VCMPEQUHo,
VCMPEQUW,
- VCMPEQUWo,
- VCMPGTSB,
- VCMPGTSBo,
- VCMPGTSD,
- VCMPGTSDo,
- VCMPGTSH,
- VCMPGTSHo,
- VCMPGTSW,
- VCMPGTSWo,
- VCMPGTUB,
- VCMPGTUBo,
- VCMPGTUD,
- VCMPGTUDo,
- VCMPGTUH,
- VCMPGTUHo,
- VCMPGTUW,
- VCMPGTUWo,
VCMPNEB,
- VCMPNEBo,
VCMPNEH,
- VCMPNEHo,
VCMPNEW,
- VCMPNEWo,
VCMPNEZB,
- VCMPNEZBo,
VCMPNEZH,
- VCMPNEZHo,
VCMPNEZW,
- VCMPNEZWo,
VEQV,
VEXTSB2D,
VEXTSB2W,
VEXTSH2D,
VEXTSH2W,
VEXTSW2D,
+ VRLB,
+ VRLD,
+ VRLDMI,
+ VRLDNM,
+ VRLH,
+ VRLW,
+ VRLWMI,
+ VRLWNM,
+ VSRAB,
+ VSRAD,
+ VSRAH,
+ VSRAW,
+ VSRB,
+ VSRD,
+ VSRH,
+ VSRW,
+ VSLB,
+ VSLD,
+ VSLH,
+ VSLW,
VMRGEW,
VMRGOW,
VNAND,
@@ -77,9 +89,7 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
VORC,
VPOPCNTB,
VPOPCNTH,
- VPOPCNTW,
VSEL,
- VSUBCUW,
VSUBUBM,
VSUBUDM,
VSUBUHM,
@@ -98,6 +108,8 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
XVNEGDP,
XVNEGSP,
XVXEXPDP,
+ XVIEXPSP,
+ XVXEXPSP,
XXLAND,
XXLANDC,
XXLEQV,
@@ -107,28 +119,128 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
XXLORf,
XXLORC,
XXLXOR,
- XXSEL
-)>;
-
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
- (instrs
+ XXSEL,
XSABSQP,
XSCPSGNQP,
XSIEXPQP,
XSNABSQP,
XSNEGQP,
- XSXEXPQP,
- XSABSDP,
- XSCPSGNDP,
- XSIEXPDP,
+ XSXEXPQP
+)>;
+
+// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a
+// slingle slice. However, since it is Restricted it requires all 3 dispatches
+// (DISP) for that superslice.
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FCMPUS,
+ FCMPUD,
+ XSTSTDCDP,
+ XSTSTDCSP
+)>;
+
+// Standard Dispatch ALU operation for 3 cycles. Only one slice used.
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSMAXCDP,
+ XSMAXDP,
+ XSMAXJDP,
+ XSMINCDP,
+ XSMINDP,
+ XSMINJDP,
+ XSTDIVDP,
+ XSTSQRTDP,
+ XSCMPEQDP,
+ XSCMPEXPDP,
+ XSCMPGEDP,
+ XSCMPGTDP,
+ XSCMPODP,
+ XSCMPUDP,
+ XSXSIGDP,
+ XSCVSPDPN
+)>;
+
+// Standard Dispatch ALU operation for 2 cycles. Only one slice used.
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ ADDIStocHA,
+ ADDItocL,
+ MCRF,
+ MCRXRX,
+ SLD,
+ SRD,
+ SRAD,
+ SRADI,
+ RLDIC,
XSNABSDP,
+ XSXEXPDP,
+ XSABSDP,
XSNEGDP,
- XSXEXPDP
+ XSCPSGNDP
)>;
-def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a
+// single slice. However, since it is Restricted it requires all 3 dispatches
+// (DISP) for that superslice.
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ RLDCL,
+ RLDCR,
+ RLDIMI,
+ RLDICL,
+ RLDICR,
+ RLDICL_32_64,
+ XSIEXPDP,
+ FMR,
+ FABSD,
+ FABSS,
+ FNABSD,
+ FNABSS,
+ FNEGD,
+ FNEGS,
+ FCPSGND,
+ FCPSGNS
+)>;
+// Three cycle ALU vector operation that uses an entire superslice.
+// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
+def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ VBPERMD,
+ VABSDUB,
+ VABSDUH,
+ VABSDUW,
+ VADDUBS,
+ VADDUHS,
+ VADDUWS,
+ VAVGSB,
+ VAVGSH,
+ VAVGSW,
+ VAVGUB,
+ VAVGUH,
+ VAVGUW,
+ VCMPEQFP,
+ VCMPEQFPo,
+ VCMPGEFP,
+ VCMPGEFPo,
+ VCMPBFP,
+ VCMPBFPo,
+ VCMPGTFP,
+ VCMPGTFPo,
+ VCLZB,
+ VCLZD,
+ VCLZH,
+ VCLZW,
+ VCTZB,
+ VCTZD,
+ VCTZH,
+ VCTZW,
+ VADDSBS,
+ VADDSHS,
+ VADDSWS,
+ VMINFP,
VMINSB,
VMINSD,
VMINSH,
@@ -137,55 +249,54 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C
VMINUD,
VMINUH,
VMINUW,
+ VMAXFP,
+ VMAXSB,
+ VMAXSD,
+ VMAXSH,
+ VMAXSW,
+ VMAXUB,
+ VMAXUD,
+ VMAXUH,
+ VMAXUW,
+ VPOPCNTW,
VPOPCNTD,
VPRTYBD,
VPRTYBW,
- VRLB,
- VRLD,
- VRLDMI,
- VRLDNM,
- VRLH,
- VRLW,
- VRLWMI,
- VRLWNM,
VSHASIGMAD,
VSHASIGMAW,
- VSLB,
- VSLD,
- VSLH,
- VSLW,
- VSRAB,
- VSRAD,
- VSRAH,
- VSRAW,
- VSRB,
- VSRD,
- VSRH,
- VSRW,
VSUBSBS,
VSUBSHS,
VSUBSWS,
VSUBUBS,
VSUBUHS,
VSUBUWS,
- XSCMPEQDP,
- XSCMPEXPDP,
- XSCMPGEDP,
- XSCMPGTDP,
- XSCMPODP,
- XSCMPUDP,
- XSCVSPDPN,
- XSMAXCDP,
- XSMAXDP,
- XSMAXJDP,
- XSMINCDP,
- XSMINDP,
- XSMINJDP,
- XSTDIVDP,
- XSTSQRTDP,
- XSTSTDCDP,
- XSTSTDCSP,
- XSXSIGDP,
+ VSUBCUW,
+ VCMPGTSB,
+ VCMPGTSBo,
+ VCMPGTSD,
+ VCMPGTSDo,
+ VCMPGTSH,
+ VCMPGTSHo,
+ VCMPGTSW,
+ VCMPGTSWo,
+ VCMPGTUB,
+ VCMPGTUBo,
+ VCMPGTUD,
+ VCMPGTUDo,
+ VCMPGTUH,
+ VCMPGTUHo,
+ VCMPGTUW,
+ VCMPGTUWo,
+ VCMPNEBo,
+ VCMPNEHo,
+ VCMPNEWo,
+ VCMPNEZBo,
+ VCMPNEZHo,
+ VCMPNEZWo,
+ VCMPEQUBo,
+ VCMPEQUDo,
+ VCMPEQUHo,
+ VCMPEQUWo,
XVCMPEQDP,
XVCMPEQDPo,
XVCMPEQSP,
@@ -198,7 +309,6 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C
XVCMPGTDPo,
XVCMPGTSP,
XVCMPGTSPo,
- XVIEXPSP,
XVMAXDP,
XVMAXSP,
XVMINDP,
@@ -209,58 +319,15 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C
XVTSQRTSP,
XVTSTDCDP,
XVTSTDCSP,
- XVXEXPSP,
XVXSIGDP,
XVXSIGSP
)>;
-def : InstRW<[P9_ALUE_4C, P9_ALUO_4C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
- (instrs
- VABSDUB,
- VABSDUH,
- VABSDUW,
- VADDSBS,
- VADDSHS,
- VADDSWS,
- VADDUBS,
- VADDUHS,
- VADDUWS,
- VAVGSB,
- VAVGSH,
- VAVGSW,
- VAVGUB,
- VAVGUH,
- VAVGUW,
- VBPERMD,
- VCLZB,
- VCLZD,
- VCLZH,
- VCLZW,
- VCMPBFP,
- VCMPBFPo,
- VCMPGTFP,
- VCMPGTFPo,
- VCTZB,
- VCTZD,
- VCTZH,
- VCTZW,
- VMAXFP,
- VMAXSB,
- VMAXSD,
- VMAXSH,
- VMAXSW,
- VMAXUB,
- VMAXUD,
- VMAXUH,
- VMAXUW,
- VMINFP,
- VCMPEQFP,
- VCMPEQFPo,
- VCMPGEFP,
- VCMPGEFPo
-)>;
-
-def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 7 cycle DP vector operation that uses an entire superslice.
+// Uses both DP units (the even DPE and odd DPO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
+def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
VADDFP,
VCTSXS,
@@ -367,8 +434,47 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
VSUMSWS
)>;
+// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
+// dispatch units for the superslice.
def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ FRSP,
+ FRIND,
+ FRINS,
+ FRIPD,
+ FRIPS,
+ FRIZD,
+ FRIZS,
+ FRIMD,
+ FRIMS,
+ FRE,
+ FRES,
+ FRSQRTE,
+ FRSQRTES,
+ FMADDS,
+ FMADD,
+ FMSUBS,
+ FMSUB,
+ FNMADDS,
+ FNMADD,
+ FNMSUBS,
+ FNMSUB,
+ FSELD,
+ FSELS,
+ FADDS,
+ FMULS,
+ FMUL,
+ FSUBS,
+ FCFID,
+ FCTID,
+ FCTIDZ,
+ FCFIDU,
+ FCFIDS,
+ FCFIDUS,
+ FCTIDUZ,
+ FCTIWUZ,
+ FCTIW,
+ FCTIWZ,
XSMADDADP,
XSMADDASP,
XSMADDMDP,
@@ -389,7 +495,19 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
XSNMSUBMSP
)>;
+// 7 cycle Restricted DP operation and one 2 cycle ALU operation.
+// The DP is restricted so we need a full 5 dispatches.
+def : InstRW<[P9_DPOpAndALUOp_9C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FMULo,
+ FMADDo,
+ FMSUBo,
+ FNMADDo,
+ FNMSUBo
+)>;
+// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units.
def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
XSADDDP,
@@ -397,8 +515,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSCVDPHP,
XSCVDPSP,
XSCVDPSXDS,
+ XSCVDPSXDSs,
XSCVDPSXWS,
XSCVDPUXDS,
+ XSCVDPUXDSs,
XSCVDPUXWS,
XSCVHPDP,
XSCVSPDP,
@@ -421,7 +541,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSCVDPSPN
)>;
-def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
+// Three Cycle PM operation. Only one PM unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
VBPERMQ,
VCLZLSBB,
@@ -469,7 +592,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
VSLO,
VSLV,
VSPLTB,
+ VSPLTBs,
VSPLTH,
+ VSPLTHs,
VSPLTISB,
VSPLTISH,
VSPLTISW,
@@ -498,6 +623,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
XXSLDWI,
XXSPLTIB,
XXSPLTW,
+ XXSPLTWs,
+ XXPERMDI,
+ XXPERMDIs,
VADDCUQ,
VADDECUQ,
VADDEUQM,
@@ -517,7 +645,10 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
XSXSIGQP
)>;
-def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSADDQP,
XSADDQPO,
@@ -536,7 +667,10 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
XSSUBQPO
)>;
-def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSMADDQP,
XSMADDQPO,
@@ -550,45 +684,57 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
XSNMSUBQPO
)>;
-def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSDIVQP,
XSDIVQPO
)>;
-def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSSQRTQP,
XSSQRTQPO
)>;
-// Load Operation in IIC_LdStLFD
-
+// 5 Cycle load uses a single slice.
def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C],
(instrs
LXSDX,
LXVD2X,
LXSIWZX,
LXV,
- LXSD
+ LXVX,
+ LXSD,
+ DFLOADf64,
+ XFLOADf64
)>;
-def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+// 4 Cycle load uses a single slice.
+def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C],
(instrs
- LFIWZX,
- LFDX,
- LFD
+ COPY
)>;
-def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+// 4 Cycle Restricted load uses a single slice but the dispatch for the whole
+// superslice.
+def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LXSSPX,
- LXSIWAX,
- LXSSP
+ LFIWZX,
+ LFDX,
+ LFD
)>;
-def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
+// Cracked Restricted Load instruction.
+// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
+// operations cannot be done at the same time and so their latencies are added.
+// Full 6 dispatches are required as this is both cracked and restricted.
+def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
LFIWAX,
@@ -596,14 +742,38 @@ def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
LFS
)>;
-def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
+// Cracked Load instruction.
+// Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU
+// operations cannot be done at the same time and so their latencies are added.
+// Full 4 dispatches are required as this is a cracked instruction.
+def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LXSSPX,
+ LXSIWAX,
+ LXSSP,
+ DFLOADf32,
+ XFLOADf32,
+ LIWAX,
+ LIWZX
+)>;
+
+// Cracked Load that requires the PM resource.
+// Since the Load and the PM cannot be done at the same time the latencies are
+// added. Requires 8 cycles.
+// Since the PM requires the full superslice we need both EXECE, EXECO pipelines
+// as well as 3 dispatches for the PM. The Load requires the remaining 2
+// dispatches.
+def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
LXVDSX,
+ LXVWSX,
LXVW4X
)>;
-// Store Operations in IIC_LdStSTFD.
-
+// Single slice Restricted store operation. The restricted operation requires
+// all three dispatches for the superslice.
def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
STFS,
@@ -613,74 +783,88 @@ def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
STFDX,
STXSDX,
STXSSPX,
- STXSIWX
+ STXSIWX,
+ DFSTOREf32,
+ DFSTOREf64,
+ XFSTOREf32,
+ XFSTOREf64,
+ STIWX
)>;
-def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C],
+// Store operation that requires the whole superslice.
+def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
STXVD2X,
STXVW4X
)>;
-// Divide Operations in IIC_IntDivW, IIC_IntDivD.
-
-def : InstRW<[P9_DIV_16C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVW,
- DIVWU
+ DIVWU,
+ MODSW
)>;
-def : InstRW<[P9_DIV_24C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+// 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVWE,
DIVD,
DIVWEU,
- DIVDU
+ DIVDU,
+ MODSD,
+ MODUD,
+ MODUW
)>;
-def : InstRW<[P9_DIV_40C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+// 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVDE,
DIVDEU
)>;
-def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
+// and one full superslice for the DIV operation since there is only one DIV
+// per superslice. Latency of DIV plus ALU is 26.
+def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ DIVDo,
+ DIVDUo,
DIVWEo,
DIVWEUo
)>;
-def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
+// and one full superslice for the DIV operation since there is only one DIV
+// per superslice. Latency of DIV plus ALU is 42.
+def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVDEo,
DIVDEUo
)>;
-// Rotate Operations in IIC_IntRotateD, IIC_IntRotateDI
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
- (instrs
- SLD,
- SRD,
- SRAD,
- SRADI,
- RLDIC
-)>;
-
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- (instrs
- RLDCL,
- RLDCR,
- RLDIMI,
- RLDICL,
- RLDICR,
- RLDICL_32_64
-)>;
-
// CR access instructions in _BrMCR, IIC_BrMCRX.
+// Cracked, restricted, ALU operations.
+// Here the two ALU ops can actually be done in parallel and therefore the
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 6 dispatches.
+// ALU ops are 2 cycles each.
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
@@ -690,13 +874,12 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
MTCRF8
)>;
-def : InstRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
- (instrs
- MCRF,
- MCRXRX
-)>;
-
-def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C,
+// Cracked, restricted, ALU operations.
+// Here the two ALU ops can actually be done in parallel and therefore the
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 6 dispatches.
+// ALU ops are 3 cycles each.
+def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
MCRFS
@@ -704,93 +887,71 @@ def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C,
// FP Div instructions in IIC_FPDivD and IIC_FPDivS.
+// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- FDIV,
- XSDIVDP
+ FDIV
)>;
-def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 33 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU.
+def : InstRW<[P9_DPOpAndALUOp_35C_8, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- FDIVS,
- XSDIVSP
+ FDIVo
)>;
-def : InstRW<[P9_DP_24C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 33 Cycle DP Instruction. Takes one slice and 2 dispatches.
+def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- XVDIVSP
+ XSDIVDP
)>;
-def : InstRW<[P9_DP_33C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- XVDIVDP
+ FDIVS
)>;
-// FP Instructions in IIC_FPGeneral, IIC_FPFused
+// 22 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU.
+def : InstRW<[P9_DPOpAndALUOp_24C_5, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FDIVSo
+)>;
-def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 22 Cycle DP Instruction. Takes one slice and 2 dispatches.
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- FRSP,
- FRIND,
- FRINS,
- FRIPD,
- FRIPS,
- FRIZD,
- FRIZS,
- FRIMD,
- FRIMS,
- FRE,
- FRES,
- FRSQRTE,
- FRSQRTES,
- FMADDS,
- FMADD,
- FMSUBS,
- FMSUB,
- FNMADDS,
- FNMADD,
- FNMSUBS,
- FNMSUB,
- FSELD,
- FSELS,
- FADDS,
- FMULS,
- FMUL,
- FSUBS,
- FCFID,
- FCTID,
- FCTIDZ,
- FCFIDU,
- FCFIDS,
- FCFIDUS,
- FCTIDUZ,
- FCTIWUZ,
- FCTIW,
- FCTIWZ
+ XSDIVSP
)>;
-def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 24 Cycle DP Vector Instruction. Takes one full superslice.
+// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given
+// superslice.
+def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
- FMR,
- FABSD,
- FABSS,
- FNABSD,
- FNABSS,
- FNEGD,
- FNEGS,
- FCPSGND,
- FCPSGNS
+ XVDIVSP
)>;
-def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 33 Cycle DP Vector Instruction. Takes one full superslice.
+// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given
+// superslice.
+def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
- FCMPUS,
- FCMPUD
+ XVDIVDP
)>;
// Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX.
-def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C,
+// Instruction cracked into three pieces. One Load and two ALU operations.
+// The Load and one of the ALU ops cannot be run at the same time and so the
+// latencies are added together for 6 cycles. The remaining ALU is 2 cycles.
+// Both the load and the ALU that depends on it are restricted and so they take
+// a total of 6 dispatches. The final 2 dispatches come from the second ALU op.
+// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load.
+def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -799,10 +960,32 @@ def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C,
LFSUX
)>;
-def : InstRW<[P9_LS_5C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
+// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
+// the load and so it can be run at the same time as the load. The load is also
+// restricted. 3 dispatches are from the restricted load while the other two
+// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline
+// is required for the ALU.
+def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
LFDU,
LFDUX
)>;
+// Crypto Instructions
+
+// 6 Cycle CY operation. Only one CY unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ VPMSUMB,
+ VPMSUMD,
+ VPMSUMH,
+ VPMSUMW,
+ VCIPHER,
+ VCIPHERLAST,
+ VNCIPHER,
+ VNCIPHERLAST,
+ VSBOX
+)>;
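To read one of these scheduling entries concretely: the DIVW/DIVWU/MODSW entry above pairs the 16-cycle P9_DIV_16C_8 resource with both execution pipelines (IP_EXECE_1C, IP_EXECO_1C) and three DISP_1C consumptions, meaning a fixed-point divide ties up the superslice's single divide unit for its full latency and takes all three of that superslice's dispatch slots in the cycle it is dispatched.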
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index ad92ac8ce1207..dfdec246e8686 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -26,8 +26,10 @@ namespace llvm {
class PassRegistry;
class FunctionPass;
class MachineInstr;
+ class MachineOperand;
class AsmPrinter;
class MCInst;
+ class MCOperand;
FunctionPass *createPPCCTRLoops();
#ifndef NDEBUG
@@ -39,20 +41,28 @@ namespace llvm {
FunctionPass *createPPCVSXCopyPass();
FunctionPass *createPPCVSXFMAMutatePass();
FunctionPass *createPPCVSXSwapRemovalPass();
+ FunctionPass *createPPCReduceCRLogicalsPass();
FunctionPass *createPPCMIPeepholePass();
FunctionPass *createPPCBranchSelectionPass();
+ FunctionPass *createPPCBranchCoalescingPass();
FunctionPass *createPPCQPXLoadSplatPass();
FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL);
FunctionPass *createPPCTLSDynamicCallPass();
FunctionPass *createPPCBoolRetToIntPass();
FunctionPass *createPPCExpandISELPass();
+ FunctionPass *createPPCPreEmitPeepholePass();
void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP, bool isDarwin);
+ bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
+ MCOperand &OutMO, AsmPrinter &AP,
+ bool isDarwin);
void initializePPCVSXFMAMutatePass(PassRegistry&);
void initializePPCBoolRetToIntPass(PassRegistry&);
void initializePPCExpandISELPass(PassRegistry &);
+ void initializePPCPreEmitPeepholePass(PassRegistry &);
void initializePPCTLSDynamicCallPass(PassRegistry &);
+ void initializePPCMIPeepholePass(PassRegistry&);
extern char &PPCVSXFMAMutateID;
namespace PPCII {
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 841b8c5144641..17451900840a4 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -19,6 +19,7 @@
#include "InstPrinter/PPCInstPrinter.h"
#include "MCTargetDesc/PPCMCExpr.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
@@ -506,7 +507,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
bool isPPC64 = Subtarget->isPPC64();
bool isDarwin = TM.getTargetTriple().isOSDarwin();
- const Module *M = MF->getFunction()->getParent();
+ const Module *M = MF->getFunction().getParent();
PICLevel::Level PL = M->getPICLevel();
// Lower multi-instruction pseudo operations.
@@ -520,7 +521,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return LowerPATCHPOINT(SM, *MI);
case PPC::MoveGOTtoLR: {
- // Transform %LR = MoveGOTtoLR
+ // Transform %lr = MoveGOTtoLR
// Into this: bl _GLOBAL_OFFSET_TABLE_@local-4
// _GLOBAL_OFFSET_TABLE_@local-4 (instruction preceding
// _GLOBAL_OFFSET_TABLE_) has exactly one instruction:
@@ -541,7 +542,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
case PPC::MovePCtoLR:
case PPC::MovePCtoLR8: {
- // Transform %LR = MovePCtoLR
+ // Transform %lr = MovePCtoLR
// Into this, where the label is the PIC base:
// bl L1$pb
// L1$pb:
@@ -559,9 +560,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::UpdateGBR: {
- // Transform %Rd = UpdateGBR(%Rt, %Ri)
- // Into: lwz %Rt, .L0$poff - .L0$pb(%Ri)
- // add %Rd, %Rt, %Ri
+ // Transform %rd = UpdateGBR(%rt, %ri)
+ // Into: lwz %rt, .L0$poff - .L0$pb(%ri)
+ // add %rd, %rt, %ri
// Get the offset from the GOT Base Register to the GOT
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
MCSymbol *PICOffset =
@@ -576,7 +577,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCOperand TR = TmpInst.getOperand(1);
const MCOperand PICR = TmpInst.getOperand(0);
- // Step 1: lwz %Rt, .L$poff - .L$pb(%Ri)
+ // Step 1: lwz %rt, .L$poff - .L$pb(%ri)
TmpInst.getOperand(1) =
MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext));
TmpInst.getOperand(0) = TR;
@@ -591,7 +592,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::LWZtoc: {
- // Transform %R3 = LWZtoc <ga:@min1>, %R2
+ // Transform %r3 = LWZtoc @min1, %r2
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
// Change the opcode to LWZ, and the global address operand to be a
@@ -635,7 +636,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::LDtocCPT:
case PPC::LDtocBA:
case PPC::LDtoc: {
- // Transform %X3 = LDtoc <ga:@min1>, %X2
+ // Transform %x3 = LDtoc @min1, %x2
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
// Change the opcode to LD, and the global address operand to be a
@@ -666,7 +667,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
case PPC::ADDIStocHA: {
- // Transform %Xd = ADDIStocHA %X2, <ga:@sym>
+ // Transform %xd = ADDIStocHA %x2, @sym
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
// Change the opcode to ADDIS8. If the global address is external, has
@@ -713,7 +714,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::LDtocL: {
- // Transform %Xd = LDtocL <ga:@sym>, %Xs
+ // Transform %xd = LDtocL @sym, %xs
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
// Change the opcode to LD. If the global address is external, has
@@ -756,7 +757,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::ADDItocL: {
- // Transform %Xd = ADDItocL %Xs, <ga:@sym>
+ // Transform %xd = ADDItocL %xs, @sym
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
// Change the opcode to ADDI8. If the global address is external, then
@@ -787,8 +788,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::ADDISgotTprelHA: {
- // Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym>
- // Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
+ // Transform: %xd = ADDISgotTprelHA %x2, @sym
+ // Into: %xd = ADDIS8 %x2, sym@got@tlsgd@ha
assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
@@ -804,7 +805,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
case PPC::LDgotTprelL:
case PPC::LDgotTprelL32: {
- // Transform %Xd = LDgotTprelL <ga:@sym>, %Xs
+ // Transform %xd = LDgotTprelL @sym, %xs
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
// Change the opcode to LD.
@@ -865,8 +866,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::ADDIStlsgdHA: {
- // Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym>
- // Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
+ // Transform: %xd = ADDIStlsgdHA %x2, @sym
+ // Into: %xd = ADDIS8 %x2, sym@got@tlsgd@ha
assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
@@ -881,11 +882,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::ADDItlsgdL:
- // Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym>
- // Into: %Xd = ADDI8 %Xs, sym@got@tlsgd@l
+ // Transform: %xd = ADDItlsgdL %xs, @sym
+ // Into: %xd = ADDI8 %xs, sym@got@tlsgd@l
case PPC::ADDItlsgdL32: {
- // Transform: %Rd = ADDItlsgdL32 %Rs, <ga:@sym>
- // Into: %Rd = ADDI %Rs, sym@got@tlsgd
+ // Transform: %rd = ADDItlsgdL32 %rs, @sym
+ // Into: %rd = ADDI %rs, sym@got@tlsgd
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
@@ -901,17 +902,17 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::GETtlsADDR:
- // Transform: %X3 = GETtlsADDR %X3, <ga:@sym>
+ // Transform: %x3 = GETtlsADDR %x3, @sym
// Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd)
case PPC::GETtlsADDR32: {
- // Transform: %R3 = GETtlsADDR32 %R3, <ga:@sym>
+ // Transform: %r3 = GETtlsADDR32 %r3, @sym
// Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT
EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSGD);
return;
}
case PPC::ADDIStlsldHA: {
- // Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym>
- // Into: %Xd = ADDIS8 %X2, sym@got@tlsld@ha
+ // Transform: %xd = ADDIStlsldHA %x2, @sym
+ // Into: %xd = ADDIS8 %x2, sym@got@tlsld@ha
assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
@@ -926,11 +927,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::ADDItlsldL:
- // Transform: %Xd = ADDItlsldL %Xs, <ga:@sym>
- // Into: %Xd = ADDI8 %Xs, sym@got@tlsld@l
+ // Transform: %xd = ADDItlsldL %xs, @sym
+ // Into: %xd = ADDI8 %xs, sym@got@tlsld@l
case PPC::ADDItlsldL32: {
- // Transform: %Rd = ADDItlsldL32 %Rs, <ga:@sym>
- // Into: %Rd = ADDI %Rs, sym@got@tlsld
+ // Transform: %rd = ADDItlsldL32 %rs, @sym
+ // Into: %rd = ADDI %rs, sym@got@tlsld
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
@@ -946,20 +947,20 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::GETtlsldADDR:
- // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym>
+ // Transform: %x3 = GETtlsldADDR %x3, @sym
// Into: BL8_NOP_TLS __tls_get_addr(sym at tlsld)
case PPC::GETtlsldADDR32: {
- // Transform: %R3 = GETtlsldADDR32 %R3, <ga:@sym>
+ // Transform: %r3 = GETtlsldADDR32 %r3, @sym
// Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT
EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSLD);
return;
}
case PPC::ADDISdtprelHA:
- // Transform: %Xd = ADDISdtprelHA %Xs, <ga:@sym>
- // Into: %Xd = ADDIS8 %Xs, sym@dtprel@ha
+ // Transform: %xd = ADDISdtprelHA %xs, @sym
+ // Into: %xd = ADDIS8 %xs, sym@dtprel@ha
case PPC::ADDISdtprelHA32: {
- // Transform: %Rd = ADDISdtprelHA32 %Rs, <ga:@sym>
- // Into: %Rd = ADDIS %Rs, sym@dtprel@ha
+ // Transform: %rd = ADDISdtprelHA32 %rs, @sym
+ // Into: %rd = ADDIS %rs, sym@dtprel@ha
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
@@ -975,11 +976,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::ADDIdtprelL:
- // Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym>
- // Into: %Xd = ADDI8 %Xs, sym@dtprel@l
+ // Transform: %xd = ADDIdtprelL %xs, @sym
+ // Into: %xd = ADDI8 %xs, sym@dtprel@l
case PPC::ADDIdtprelL32: {
- // Transform: %Rd = ADDIdtprelL32 %Rs, <ga:@sym>
- // Into: %Rd = ADDI %Rs, sym@dtprel@l
+ // Transform: %rd = ADDIdtprelL32 %rs, @sym
+ // Into: %rd = ADDI %rs, sym@dtprel@l
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
@@ -996,8 +997,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::MFOCRF:
case PPC::MFOCRF8:
if (!Subtarget->hasMFOCRF()) {
- // Transform: %R3 = MFOCRF %CR7
- // Into: %R3 = MFCR ;; cr7
+ // Transform: %r3 = MFOCRF %cr7
+ // Into: %r3 = MFCR ;; cr7
unsigned NewOpcode =
MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8;
OutStreamer->AddComment(PPCInstPrinter::
@@ -1010,8 +1011,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::MTOCRF:
case PPC::MTOCRF8:
if (!Subtarget->hasMFOCRF()) {
- // Transform: %CR7 = MTOCRF %R3
- // Into: MTCRF mask, %R3 ;; cr7
+ // Transform: %cr7 = MTOCRF %r3
+ // Into: MTCRF mask, %r3 ;; cr7
unsigned NewOpcode =
MI->getOpcode() == PPC::MTOCRF ? PPC::MTCRF : PPC::MTCRF8;
unsigned Mask = 0x80 >> OutContext.getRegisterInfo()
@@ -1089,7 +1090,61 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER);
break;
}
- case TargetOpcode::PATCHABLE_FUNCTION_EXIT: {
+ case TargetOpcode::PATCHABLE_RET: {
+ unsigned RetOpcode = MI->getOperand(0).getImm();
+ MCInst RetInst;
+ RetInst.setOpcode(RetOpcode);
+ for (const auto &MO :
+ make_range(std::next(MI->operands_begin()), MI->operands_end())) {
+ MCOperand MCOp;
+ if (LowerPPCMachineOperandToMCOperand(MO, MCOp, *this, false))
+ RetInst.addOperand(MCOp);
+ }
+
+ bool IsConditional;
+ if (RetOpcode == PPC::BCCLR) {
+ IsConditional = true;
+ } else if (RetOpcode == PPC::TCRETURNdi8 || RetOpcode == PPC::TCRETURNri8 ||
+ RetOpcode == PPC::TCRETURNai8) {
+ break;
+ } else if (RetOpcode == PPC::BLR8 || RetOpcode == PPC::TAILB8) {
+ IsConditional = false;
+ } else {
+ EmitToStreamer(*OutStreamer, RetInst);
+ break;
+ }
+
+ MCSymbol *FallthroughLabel;
+ if (IsConditional) {
+ // Before:
+ // bgtlr cr0
+ //
+ // After:
+ // ble cr0, .end
+ // .p2align 3
+ // .begin:
+ // blr # lis 0, FuncId[16..32]
+ // nop # li 0, FuncId[0..15]
+ // std 0, -8(1)
+ // mflr 0
+ // bl __xray_FunctionExit
+ // mtlr 0
+ // blr
+ // .end:
+ //
+ // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number
+ // of instructions change.
+ FallthroughLabel = OutContext.createTempSymbol();
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(PPC::BCC)
+ .addImm(PPC::InvertPredicate(
+ static_cast<PPC::Predicate>(MI->getOperand(1).getImm())))
+ .addReg(MI->getOperand(2).getReg())
+ .addExpr(MCSymbolRefExpr::create(FallthroughLabel, OutContext)));
+ RetInst = MCInst();
+ RetInst.setOpcode(PPC::BLR8);
+ }
// .p2align 3
// .begin:
// b(lr)? # lis 0, FuncId[16..32]
@@ -1098,24 +1153,14 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// mflr 0
// bl __xray_FunctionExit
// mtlr 0
- // .end:
// b(lr)?
//
// Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number
// of instructions change.
- const MachineInstr *Next = [&] {
- MachineBasicBlock::const_iterator It(MI);
- assert(It != MI->getParent()->end());
- ++It;
- assert(It->isReturn());
- return &*It;
- }();
OutStreamer->EmitCodeAlignment(8);
MCSymbol *BeginOfSled = OutContext.createTempSymbol();
OutStreamer->EmitLabel(BeginOfSled);
- MCInst TmpInst;
- LowerPPCMachineInstrToMCInst(Next, TmpInst, *this, false);
- EmitToStreamer(*OutStreamer, TmpInst);
+ EmitToStreamer(*OutStreamer, RetInst);
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
EmitToStreamer(
*OutStreamer,
@@ -1127,15 +1172,18 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
OutContext.getOrCreateSymbol("__xray_FunctionExit"),
OutContext)));
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
+ EmitToStreamer(*OutStreamer, RetInst);
+ if (IsConditional)
+ OutStreamer->EmitLabel(FallthroughLabel);
recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT);
break;
}
+ case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
+ llvm_unreachable("PATCHABLE_FUNCTION_EXIT should never be emitted");
case TargetOpcode::PATCHABLE_TAIL_CALL:
- case TargetOpcode::PATCHABLE_RET:
- // PPC's tail call instruction, e.g. PPC::TCRETURNdi8, doesn't really
- // lower to a PPC::B instruction. The PPC::B instruction is generated
- // before it, and handled by the normal case.
- llvm_unreachable("Tail call is handled in the normal case. See comments"
+ // TODO: Define a trampoline `__xray_FunctionTailExit` and differentiate a
+ // normal function exit from a tail exit.
+ llvm_unreachable("Tail call is handled in the normal case. See comments "
"around this assert.");
}
}
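For readers unfamiliar with the conditional-return sled above: the key step is inverting the branch predicate so execution can branch around the sled when the original conditional return would not have been taken (the "bgtlr cr0" / "ble cr0, .end" pair in the comment). The toy program below is an editorial sketch, not part of this patch; the ToyPred enum and invert() helper are made-up stand-ins for PPC::Predicate and PPC::InvertPredicate.

#include <cstdio>

// Hypothetical stand-in for PPC::Predicate and PPC::InvertPredicate.
enum ToyPred { PRED_LT, PRED_LE, PRED_EQ, PRED_GE, PRED_GT, PRED_NE };

static ToyPred invert(ToyPred P) {
  switch (P) {
  case PRED_LT: return PRED_GE;
  case PRED_LE: return PRED_GT;
  case PRED_EQ: return PRED_NE;
  case PRED_GE: return PRED_LT;
  case PRED_GT: return PRED_LE;   // "bgtlr" -> "ble .end", as in the comment
  case PRED_NE: return PRED_EQ;
  }
  return P;
}

int main() {
  // A conditional return on "greater than" is rewritten as a branch past the
  // sled on the inverted condition, "less than or equal".
  printf("invert(PRED_GT) == PRED_LE ? %d\n", invert(PRED_GT) == PRED_LE);
  return 0;
}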
@@ -1180,7 +1228,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
// linux/ppc32 - Normal entry label.
if (!Subtarget->isPPC64() &&
(!isPositionIndependent() ||
- MF->getFunction()->getParent()->getPICLevel() == PICLevel::SmallPIC))
+ MF->getFunction().getParent()->getPICLevel() == PICLevel::SmallPIC))
return AsmPrinter::EmitFunctionEntryLabel();
if (!Subtarget->isPPC64()) {
diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
new file mode 100644
index 0000000000000..32d801b13ded9
--- /dev/null
+++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -0,0 +1,784 @@
+//===-- PPCBranchCoalescing.cpp - Coalesce blocks with the same condition -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Coalesce basic blocks guarded by the same branch condition into a single
+/// basic block.
+///
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-branch-coalescing"
+
+STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced");
+STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged");
+STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced");
+
+namespace llvm {
+ void initializePPCBranchCoalescingPass(PassRegistry&);
+}
+
+//===----------------------------------------------------------------------===//
+// PPCBranchCoalescing
+//===----------------------------------------------------------------------===//
+///
+/// Improve scheduling by coalescing branches that depend on the same condition.
+/// This pass looks for blocks that are guarded by the same branch condition
+/// and attempts to merge the blocks together. Such opportunities arise from
+/// the expansion of select statements in the IR.
+///
+/// This pass does not handle implicit operands on branch statements. In order
+/// to run on targets that use implicit operands, changes need to be made in the
+/// canCoalesceBranch and canMerge methods.
+///
+/// Example: the following LLVM IR
+///
+/// %test = icmp eq i32 %x 0
+/// %tmp1 = select i1 %test, double %a, double 2.000000e-03
+/// %tmp2 = select i1 %test, double %b, double 5.000000e-03
+///
+/// expands to the following machine code:
+///
+/// %bb.0: derived from LLVM BB %entry
+/// Live Ins: %f1 %f3 %x6
+/// <SNIP1>
+/// %0 = COPY %f1; F8RC:%0
+/// %5 = CMPLWI killed %4, 0; CRRC:%5 GPRC:%4
+/// %8 = LXSDX %zero8, killed %7, implicit %rm;
+/// mem:LD8[ConstantPool] F8RC:%8 G8RC:%7
+/// BCC 76, %5, <%bb.2>; CRRC:%5
+/// Successors according to CFG: %bb.1(?%) %bb.2(?%)
+///
+/// %bb.1: derived from LLVM BB %entry
+/// Predecessors according to CFG: %bb.0
+/// Successors according to CFG: %bb.2(?%)
+///
+/// %bb.2: derived from LLVM BB %entry
+/// Predecessors according to CFG: %bb.0 %bb.1
+/// %9 = PHI %8, <%bb.1>, %0, <%bb.0>;
+/// F8RC:%9,%8,%0
+/// <SNIP2>
+/// BCC 76, %5, <%bb.4>; CRRC:%5
+/// Successors according to CFG: %bb.3(?%) %bb.4(?%)
+///
+/// %bb.3: derived from LLVM BB %entry
+/// Predecessors according to CFG: %bb.2
+/// Successors according to CFG: %bb.4(?%)
+///
+/// %bb.4: derived from LLVM BB %entry
+/// Predecessors according to CFG: %bb.2 %bb.3
+/// %13 = PHI %12, <%bb.3>, %2, <%bb.2>;
+/// F8RC:%13,%12,%2
+/// <SNIP3>
+/// BLR8 implicit %lr8, implicit %rm, implicit %f1
+///
+/// When this pattern is detected, branch coalescing will try to collapse
+/// it by moving code in %bb.2 to %bb.0 and/or %bb.4 and removing %bb.3.
+///
+/// If all conditions are met, the IR should collapse to:
+///
+/// %bb.0: derived from LLVM BB %entry
+/// Live Ins: %f1 %f3 %x6
+/// <SNIP1>
+/// %0 = COPY %f1; F8RC:%0
+/// %5 = CMPLWI killed %4, 0; CRRC:%5 GPRC:%4
+/// %8 = LXSDX %zero8, killed %7, implicit %rm;
+/// mem:LD8[ConstantPool] F8RC:%8 G8RC:%7
+/// <SNIP2>
+/// BCC 76, %5, <%bb.4>; CRRC:%5
+/// Successors according to CFG: %bb.1(0x2aaaaaaa / 0x80000000 = 33.33%)
+/// %bb.4(0x55555554 / 0x80000000 = 66.67%)
+///
+/// %bb.1: derived from LLVM BB %entry
+/// Predecessors according to CFG: %bb.0
+/// Successors according to CFG: %bb.4(0x40000000 / 0x80000000 = 50.00%)
+///
+/// %bb.4: derived from LLVM BB %entry
+/// Predecessors according to CFG: %bb.0 %bb.1
+/// %9 = PHI %8, <%bb.1>, %0, <%bb.0>;
+/// F8RC:%9,%8,%0
+/// %13 = PHI %12, <%bb.1>, %2, <%bb.0>;
+/// F8RC:%13,%12,%2
+/// <SNIP3>
+/// BLR8 implicit %lr8, implicit %rm, implicit %f1
+///
+/// Branch Coalescing does not split blocks; it moves everything in the same
+/// direction, ensuring it does not break use/definition semantics.
+///
+/// PHI nodes and their corresponding use instructions are moved to the
+/// successor block if there are no uses within the successor block's PHI
+/// nodes. PHI node ordering cannot be assumed.
+///
+/// Non-PHI instructions can be moved up to the predecessor basic block or down
+/// to the successor basic block following any PHI instructions. Whether an
+/// instruction moves up or down depends on whether the register(s) it defines
+/// are used in the current block or in any PHI instructions at the beginning
+/// of the successor block.
+
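At the source level, the pattern described above typically comes from two conditional expressions guarded by the same test. The fragment below is an editorial illustration only (the function name and signature are made up; the constants mirror the comment's example): its IR contains two selects on the same %test and therefore lowers to the back-to-back triangles this pass coalesces.

// Editorial illustration: C++ source whose IR contains two selects guarded by
// the same comparison, matching the machine-code pattern described above.
double coalesce_example(int x, double a, double b) {
  double Tmp1 = (x == 0) ? a : 2.0e-3;  // -> select i1 %test, double %a, 2.0e-3
  double Tmp2 = (x == 0) ? b : 5.0e-3;  // -> select i1 %test, double %b, 5.0e-3
  return Tmp1 + Tmp2;
}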
+namespace {
+
+class PPCBranchCoalescing : public MachineFunctionPass {
+ struct CoalescingCandidateInfo {
+ MachineBasicBlock *BranchBlock; // Block containing the branch
+ MachineBasicBlock *BranchTargetBlock; // Block branched to
+ MachineBasicBlock *FallThroughBlock; // Fall-through if branch not taken
+ SmallVector<MachineOperand, 4> Cond;
+ bool MustMoveDown;
+ bool MustMoveUp;
+
+ CoalescingCandidateInfo();
+ void clear();
+ };
+
+ MachineDominatorTree *MDT;
+ MachinePostDominatorTree *MPDT;
+ const TargetInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+
+ void initialize(MachineFunction &F);
+ bool canCoalesceBranch(CoalescingCandidateInfo &Cand);
+ bool identicalOperands(ArrayRef<MachineOperand> OperandList1,
+ ArrayRef<MachineOperand> OperandList2) const;
+ bool validateCandidates(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const;
+
+public:
+ static char ID;
+
+ PPCBranchCoalescing() : MachineFunctionPass(ID) {
+ initializePPCBranchCoalescingPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Branch Coalescing"; }
+
+ bool mergeCandidates(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion);
+ bool canMoveToBeginning(const MachineInstr &MI,
+ const MachineBasicBlock &MBB) const;
+ bool canMoveToEnd(const MachineInstr &MI,
+ const MachineBasicBlock &MBB) const;
+ bool canMerge(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const;
+ void moveAndUpdatePHIs(MachineBasicBlock *SourceRegionMBB,
+ MachineBasicBlock *TargetRegionMBB);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char PPCBranchCoalescing::ID = 0;
+/// createPPCBranchCoalescingPass - returns an instance of the Branch Coalescing
+/// Pass
+FunctionPass *llvm::createPPCBranchCoalescingPass() {
+ return new PPCBranchCoalescing();
+}
+
+INITIALIZE_PASS_BEGIN(PPCBranchCoalescing, DEBUG_TYPE,
+ "Branch Coalescing", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(PPCBranchCoalescing, DEBUG_TYPE, "Branch Coalescing",
+ false, false)
+
+PPCBranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo()
+ : BranchBlock(nullptr), BranchTargetBlock(nullptr),
+ FallThroughBlock(nullptr), MustMoveDown(false), MustMoveUp(false) {}
+
+void PPCBranchCoalescing::CoalescingCandidateInfo::clear() {
+ BranchBlock = nullptr;
+ BranchTargetBlock = nullptr;
+ FallThroughBlock = nullptr;
+ Cond.clear();
+ MustMoveDown = false;
+ MustMoveUp = false;
+}
+
+void PPCBranchCoalescing::initialize(MachineFunction &MF) {
+ MDT = &getAnalysis<MachineDominatorTree>();
+ MPDT = &getAnalysis<MachinePostDominatorTree>();
+ TII = MF.getSubtarget().getInstrInfo();
+ MRI = &MF.getRegInfo();
+}
+
+///
+/// Analyze the branch statement of the given candidate to determine whether it
+/// can be coalesced. If the branch can be coalesced, the BranchTargetBlock and
+/// the FallThroughBlock are recorded in the specified Candidate.
+///
+///\param[in,out] Cand The coalescing candidate to analyze
+///\return true if and only if the branch can be coalesced, false otherwise
+///
+bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
+ DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber()
+ << " can be coalesced:");
+ MachineBasicBlock *FalseMBB = nullptr;
+
+ if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB,
+ Cand.Cond)) {
+ DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n");
+ return false;
+ }
+
+ for (auto &I : Cand.BranchBlock->terminators()) {
+ DEBUG(dbgs() << "Looking at terminator : " << I << "\n");
+ if (!I.isBranch())
+ continue;
+
+ // The analyzeBranch method does not include any implicit operands.
+ // This is not an issue on PPC but must be handled on other targets.
+ // For this pass to be made target-independent, the analyzeBranch API
+ // needs to be updated to support implicit operands, and there would
+ // need to be a way to verify that any implicit operands would not be
+ // clobbered by merging blocks. This would include identifying the
+ // implicit operands as well as the basic block they are defined in.
+ // This could be done by changing the analyzeBranch API to have it also
+ // record and return the implicit operands and the blocks where they are
+ // defined. Alternatively, the BranchCoalescing code would need to be
+ // extended to identify the implicit operands. The analysis in canMerge
+ // must then be extended to prove that none of the implicit operands are
+ // changed in the blocks that are combined during coalescing.
+ if (I.getNumOperands() != I.getNumExplicitOperands()) {
+ DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I
+ << "\n");
+ return false;
+ }
+ }
+
+ if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) {
+ DEBUG(dbgs() << "EH Pad - skip\n");
+ return false;
+ }
+
+ // For now only consider triangles (i.e, BranchTargetBlock is set,
+ // FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock)
+ if (!Cand.BranchTargetBlock || FalseMBB ||
+ !Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) {
+ DEBUG(dbgs() << "Does not form a triangle - skip\n");
+ return false;
+ }
+
+ // Ensure there are only two successors
+ if (Cand.BranchBlock->succ_size() != 2) {
+ DEBUG(dbgs() << "Does not have 2 successors - skip\n");
+ return false;
+ }
+
+ // Sanity check - the block must be able to fall through
+ assert(Cand.BranchBlock->canFallThrough() &&
+ "Expecting the block to fall through!");
+
+ // We have already ensured there are exactly two successors to
+ // BranchBlock and that BranchTargetBlock is a successor to BranchBlock.
+ // Ensure the single fall-through block is empty.
+ MachineBasicBlock *Succ =
+ (*Cand.BranchBlock->succ_begin() == Cand.BranchTargetBlock)
+ ? *Cand.BranchBlock->succ_rbegin()
+ : *Cand.BranchBlock->succ_begin();
+
+ assert(Succ && "Expecting a valid fall-through block\n");
+
+ if (!Succ->empty()) {
+ DEBUG(dbgs() << "Fall-through block contains code -- skip\n");
+ return false;
+ }
+
+ if (!Succ->isSuccessor(Cand.BranchTargetBlock)) {
+ DEBUG(dbgs()
+ << "Successor of fall through block is not branch taken block\n");
+ return false;
+ }
+
+ Cand.FallThroughBlock = Succ;
+ DEBUG(dbgs() << "Valid Candidate\n");
+ return true;
+}
+
+///
+/// Determine if the two operand lists are identical
+///
+/// \param[in] OpList1 operand list
+/// \param[in] OpList2 operand list
+/// \return true if and only if the operand lists are identical
+///
+bool PPCBranchCoalescing::identicalOperands(
+ ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const {
+
+ if (OpList1.size() != OpList2.size()) {
+ DEBUG(dbgs() << "Operand list is different size\n");
+ return false;
+ }
+
+ for (unsigned i = 0; i < OpList1.size(); ++i) {
+ const MachineOperand &Op1 = OpList1[i];
+ const MachineOperand &Op2 = OpList2[i];
+
+ DEBUG(dbgs() << "Op1: " << Op1 << "\n"
+ << "Op2: " << Op2 << "\n");
+
+ if (Op1.isIdenticalTo(Op2)) {
+ // filter out instructions with physical-register uses
+ if (Op1.isReg() && TargetRegisterInfo::isPhysicalRegister(Op1.getReg())
+ // If the physical register is constant then we can assume the value
+ // has not changed between uses.
+ && !(Op1.isUse() && MRI->isConstantPhysReg(Op1.getReg()))) {
+ DEBUG(dbgs() << "The operands are not provably identical.\n");
+ return false;
+ }
+ DEBUG(dbgs() << "Op1 and Op2 are identical!\n");
+ continue;
+ }
+
+ // If the operands are not identical, but are registers, check to see if the
+ // definition of the register produces the same value. If they produce the
+ // same value, consider them to be identical.
+ if (Op1.isReg() && Op2.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(Op1.getReg()) &&
+ TargetRegisterInfo::isVirtualRegister(Op2.getReg())) {
+ MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg());
+ MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg());
+ if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) {
+ DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def
+ << " produce the same value!\n");
+ } else {
+ DEBUG(dbgs() << "Operands produce different values\n");
+ return false;
+ }
+ } else {
+ DEBUG(dbgs() << "The operands are not provably identical.\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+///
+/// Moves ALL PHI instructions in SourceMBB to the beginning of TargetMBB
+/// and updates them to refer to the new block. PHI node ordering
+/// cannot be assumed, so it does not matter where the PHI instructions
+/// are moved to in TargetMBB.
+///
+/// \param[in] SourceMBB block to move PHI instructions from
+/// \param[in] TargetMBB block to move PHI instructions to
+///
+void PPCBranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB,
+ MachineBasicBlock *TargetMBB) {
+
+ MachineBasicBlock::iterator MI = SourceMBB->begin();
+ MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI();
+
+ if (MI == ME) {
+ DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n");
+ return;
+ }
+
+ // Update all PHI instructions in SourceMBB and move to top of TargetMBB
+ for (MachineBasicBlock::iterator Iter = MI; Iter != ME; Iter++) {
+ MachineInstr &PHIInst = *Iter;
+ for (unsigned i = 2, e = PHIInst.getNumOperands() + 1; i != e; i += 2) {
+ MachineOperand &MO = PHIInst.getOperand(i);
+ if (MO.getMBB() == SourceMBB)
+ MO.setMBB(TargetMBB);
+ }
+ }
+ TargetMBB->splice(TargetMBB->begin(), SourceMBB, MI, ME);
+}
+
+///
+/// This function checks if MI can be moved to the beginning of the TargetMBB
+/// following PHI instructions. An MI instruction can be moved to the beginning
+/// of the TargetMBB if there are no uses of it within the TargetMBB PHI nodes.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to beginning of TargetMBB,
+/// false otherwise.
+///
+bool PPCBranchCoalescing::canMoveToBeginning(const MachineInstr &MI,
+ const MachineBasicBlock &TargetMBB
+ ) const {
+
+ DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of "
+ << TargetMBB.getNumber() << "\n");
+
+ for (auto &Def : MI.defs()) { // Looking at Def
+ for (auto &Use : MRI->use_instructions(Def.getReg())) {
+ if (Use.isPHI() && Use.getParent() == &TargetMBB) {
+ DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n");
+ return false;
+ }
+ }
+ }
+
+ DEBUG(dbgs() << " Safe to move to the beginning.\n");
+ return true;
+}
+
+///
+/// This function checks if MI can be moved to the end of the TargetMBB,
+/// immediately before the first terminator. An MI instruction can be moved
+/// to the end of the TargetMBB if no PHI node defines what MI uses within
+/// its own MBB.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to end of TargetMBB,
+/// false otherwise.
+///
+bool PPCBranchCoalescing::canMoveToEnd(const MachineInstr &MI,
+ const MachineBasicBlock &TargetMBB
+ ) const {
+
+ DEBUG(dbgs() << "Checking if " << MI << " can move to end of "
+ << TargetMBB.getNumber() << "\n");
+
+ for (auto &Use : MI.uses()) {
+ if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) {
+ MachineInstr *DefInst = MRI->getVRegDef(Use.getReg());
+ if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) {
+ DEBUG(dbgs() << " *** Cannot move this instruction ***\n");
+ return false;
+ } else {
+ DEBUG(dbgs() << " *** def is in another block -- safe to move!\n");
+ }
+ }
+ }
+
+ DEBUG(dbgs() << " Safe to move to the end.\n");
+ return true;
+}
+
+///
+/// This method checks to ensure the two coalescing candidates follow the
+/// expected pattern required for coalescing.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+/// into a block in TargetRegion; false otherwise.
+///
+bool PPCBranchCoalescing::validateCandidates(
+ CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const {
+
+ if (TargetRegion.BranchTargetBlock != SourceRegion.BranchBlock)
+ llvm_unreachable("Expecting SourceRegion to immediately follow TargetRegion");
+ else if (!MDT->dominates(TargetRegion.BranchBlock, SourceRegion.BranchBlock))
+ llvm_unreachable("Expecting TargetRegion to dominate SourceRegion");
+ else if (!MPDT->dominates(SourceRegion.BranchBlock, TargetRegion.BranchBlock))
+ llvm_unreachable("Expecting SourceRegion to post-dominate TargetRegion");
+ else if (!TargetRegion.FallThroughBlock->empty() ||
+ !SourceRegion.FallThroughBlock->empty())
+ llvm_unreachable("Expecting fall-through blocks to be empty");
+
+ return true;
+}
+
+///
+/// This method determines whether the two coalescing candidates can be merged.
+/// In order to be merged, all instructions must be able to
+/// 1. Move to the beginning of the SourceRegion.BranchTargetBlock;
+/// 2. Move to the end of the TargetRegion.BranchBlock.
+/// Merging involves moving the instructions in the
+/// TargetRegion.BranchTargetBlock (also SourceRegion.BranchBlock).
+///
+/// This function first tries to move instructions from the
+/// TargetRegion.BranchTargetBlock down, to the beginning of the
+/// SourceRegion.BranchTargetBlock. This is not possible if any register defined
+/// in TargetRegion.BranchTargetBlock is used in a PHI node in the
+/// SourceRegion.BranchTargetBlock. In this case, check whether the statement
+/// can be moved up, to the end of the TargetRegion.BranchBlock (immediately
+/// before the branch statement). If it cannot move, then these blocks cannot
+/// be merged.
+///
+/// Note that there is no analysis for moving instructions past the fall-through
+/// blocks because they are confirmed to be empty. An assert is thrown if they
+/// are not.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+/// into a block in TargetRegion, false otherwise.
+///
+bool PPCBranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const {
+ if (!validateCandidates(SourceRegion, TargetRegion))
+ return false;
+
+ // Walk through PHI nodes first and see if they force the merge into the
+ // SourceRegion.BranchTargetBlock.
+ for (MachineBasicBlock::iterator
+ I = SourceRegion.BranchBlock->instr_begin(),
+ E = SourceRegion.BranchBlock->getFirstNonPHI();
+ I != E; ++I) {
+ for (auto &Def : I->defs())
+ for (auto &Use : MRI->use_instructions(Def.getReg())) {
+ if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) {
+ DEBUG(dbgs() << "PHI " << *I << " defines register used in another "
+ "PHI within branch target block -- can't merge\n");
+ NumPHINotMoved++;
+ return false;
+ }
+ if (Use.getParent() == SourceRegion.BranchBlock) {
+ DEBUG(dbgs() << "PHI " << *I
+ << " defines register used in this "
+ "block -- all must move down\n");
+ SourceRegion.MustMoveDown = true;
+ }
+ }
+ }
+
+ // Walk through the MI to see if they should be merged into
+ // TargetRegion.BranchBlock (up) or SourceRegion.BranchTargetBlock (down)
+ for (MachineBasicBlock::iterator
+ I = SourceRegion.BranchBlock->getFirstNonPHI(),
+ E = SourceRegion.BranchBlock->end();
+ I != E; ++I) {
+ if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) {
+ DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move down - must move up!\n");
+ SourceRegion.MustMoveUp = true;
+ }
+ if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) {
+ DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move up - must move down!\n");
+ SourceRegion.MustMoveDown = true;
+ }
+ }
+
+ return (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) ? false : true;
+}
+
+/// Merge the instructions from SourceRegion.BranchBlock,
+/// SourceRegion.BranchTargetBlock, and SourceRegion.FallThroughBlock into
+/// TargetRegion.BranchBlock, TargetRegion.BranchTargetBlock and
+/// TargetRegion.FallThroughBlock respectively.
+///
+/// The successors for blocks in TargetRegion will be updated to use the
+/// successors from blocks in SourceRegion. Finally, the blocks in SourceRegion
+/// will be removed from the function.
+///
+/// A region consists of a BranchBlock, a FallThroughBlock, and a
+/// BranchTargetBlock. Branch coalescing works on patterns where the
+/// TargetRegion's BranchTargetBlock must also be the SourceRegion's
+/// BranchBlock.
+///
+/// Before mergeCandidates:
+///
+/// +---------------------------+
+/// | TargetRegion.BranchBlock |
+/// +---------------------------+
+/// / |
+/// / +--------------------------------+
+/// | | TargetRegion.FallThroughBlock |
+/// \ +--------------------------------+
+/// \ |
+/// +----------------------------------+
+/// | TargetRegion.BranchTargetBlock |
+/// | SourceRegion.BranchBlock |
+/// +----------------------------------+
+/// / |
+/// / +--------------------------------+
+/// | | SourceRegion.FallThroughBlock |
+/// \ +--------------------------------+
+/// \ |
+/// +----------------------------------+
+/// | SourceRegion.BranchTargetBlock |
+/// +----------------------------------+
+///
+/// After mergeCandidates:
+///
+/// +-----------------------------+
+/// | TargetRegion.BranchBlock |
+/// | SourceRegion.BranchBlock |
+/// +-----------------------------+
+/// / |
+/// / +---------------------------------+
+/// | | TargetRegion.FallThroughBlock |
+/// | | SourceRegion.FallThroughBlock |
+/// \ +---------------------------------+
+/// \ |
+/// +----------------------------------+
+/// | SourceRegion.BranchTargetBlock |
+/// +----------------------------------+
+///
+/// \param[in] SourceRegion The candidate to move blocks from
+/// \param[in] TargetRegion The candidate to move blocks to
+///
+bool PPCBranchCoalescing::mergeCandidates(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) {
+
+ if (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) {
+ llvm_unreachable("Cannot have both MustMoveDown and MustMoveUp set!");
+ return false;
+ }
+
+ if (!validateCandidates(SourceRegion, TargetRegion))
+ return false;
+
+ // Start the merging process by first handling the BranchBlock.
+ // Move any PHIs in SourceRegion.BranchBlock down to the branch-taken block
+ moveAndUpdatePHIs(SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock);
+
+ // Move remaining instructions in SourceRegion.BranchBlock into
+ // TargetRegion.BranchBlock
+ MachineBasicBlock::iterator firstInstr =
+ SourceRegion.BranchBlock->getFirstNonPHI();
+ MachineBasicBlock::iterator lastInstr =
+ SourceRegion.BranchBlock->getFirstTerminator();
+
+ MachineBasicBlock *Source = SourceRegion.MustMoveDown
+ ? SourceRegion.BranchTargetBlock
+ : TargetRegion.BranchBlock;
+
+ MachineBasicBlock::iterator Target =
+ SourceRegion.MustMoveDown
+ ? SourceRegion.BranchTargetBlock->getFirstNonPHI()
+ : TargetRegion.BranchBlock->getFirstTerminator();
+
+ Source->splice(Target, SourceRegion.BranchBlock, firstInstr, lastInstr);
+
+ // Once PHIs and other instructions have been moved, we need to clean up the
+ // control flow.
+
+ // Remove SourceRegion.FallThroughBlock before transferring successors of
+ // SourceRegion.BranchBlock to TargetRegion.BranchBlock.
+ SourceRegion.BranchBlock->removeSuccessor(SourceRegion.FallThroughBlock);
+ TargetRegion.BranchBlock->transferSuccessorsAndUpdatePHIs(
+ SourceRegion.BranchBlock);
+ // Update branch in TargetRegion.BranchBlock to jump to
+ // SourceRegion.BranchTargetBlock
+ // In this case, TargetRegion.BranchTargetBlock == SourceRegion.BranchBlock.
+ TargetRegion.BranchBlock->ReplaceUsesOfBlockWith(
+ SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock);
+ // Remove the branch statement(s) in SourceRegion.BranchBlock
+ MachineBasicBlock::iterator I =
+ SourceRegion.BranchBlock->terminators().begin();
+ while (I != SourceRegion.BranchBlock->terminators().end()) {
+ MachineInstr &CurrInst = *I;
+ ++I;
+ if (CurrInst.isBranch())
+ CurrInst.eraseFromParent();
+ }
+
+ // Fall-through block should be empty since this is part of the condition
+ // to coalesce the branches.
+ assert(TargetRegion.FallThroughBlock->empty() &&
+ "FallThroughBlocks should be empty!");
+
+ // Transfer successor information and move PHIs down to the
+ // branch-taken block.
+ TargetRegion.FallThroughBlock->transferSuccessorsAndUpdatePHIs(
+ SourceRegion.FallThroughBlock);
+ TargetRegion.FallThroughBlock->removeSuccessor(SourceRegion.BranchBlock);
+
+ // Remove the blocks from the function.
+ assert(SourceRegion.BranchBlock->empty() &&
+ "Expecting branch block to be empty!");
+ SourceRegion.BranchBlock->eraseFromParent();
+
+ assert(SourceRegion.FallThroughBlock->empty() &&
+ "Expecting fall-through block to be empty!\n");
+ SourceRegion.FallThroughBlock->eraseFromParent();
+
+ NumBlocksCoalesced++;
+ return true;
+}
+
+bool PPCBranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
+
+ if (skipFunction(MF.getFunction()) || MF.empty())
+ return false;
+
+ bool didSomething = false;
+
+ DEBUG(dbgs() << "******** Branch Coalescing ********\n");
+ initialize(MF);
+
+ DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+
+ CoalescingCandidateInfo Cand1, Cand2;
+ // Walk over blocks and find candidates to merge
+ // Continue trying to merge with the first candidate found, as long as merging
+ // is successful.
+ for (MachineBasicBlock &MBB : MF) {
+ bool MergedCandidates = false;
+ do {
+ MergedCandidates = false;
+ Cand1.clear();
+ Cand2.clear();
+
+ Cand1.BranchBlock = &MBB;
+
+ // If unable to coalesce the branch, then continue to next block
+ if (!canCoalesceBranch(Cand1))
+ break;
+
+ Cand2.BranchBlock = Cand1.BranchTargetBlock;
+ if (!canCoalesceBranch(Cand2))
+ break;
+
+ // Sanity check
+ // The branch-taken block of the second candidate should post-dominate the
+ // first candidate
+ assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) &&
+ "Branch-taken block should post-dominate first candidate");
+
+ if (!identicalOperands(Cand1.Cond, Cand2.Cond)) {
+ DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and "
+ << Cand2.BranchBlock->getNumber()
+ << " have different branches\n");
+ break;
+ }
+ if (!canMerge(Cand2, Cand1)) {
+ DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber()
+ << " and " << Cand2.BranchBlock->getNumber() << "\n");
+ NumBlocksNotCoalesced++;
+ continue;
+ }
+ DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber()
+ << " and " << Cand1.BranchTargetBlock->getNumber() << "\n");
+ MergedCandidates = mergeCandidates(Cand2, Cand1);
+ if (MergedCandidates)
+ didSomething = true;
+
+ DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n");
+ } while (MergedCandidates);
+ }
+
+#ifndef NDEBUG
+ // Verify MF is still valid after branch coalescing
+ if (didSomething)
+ MF.verify(nullptr, "Error in code produced by branch coalescing");
+#endif // NDEBUG
+
+ DEBUG(dbgs() << "Finished Branch Coalescing\n");
+ return didSomething;
+}
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index d0b66f9bca09a..64b8f1168beb8 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -23,9 +23,9 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
#define DEBUG_TYPE "ppc-branch-select"
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 53f33ac1fc0ed..fc638829378ab 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -26,12 +26,17 @@
#include "PPC.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
+#include "PPCTargetTransformInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
@@ -64,6 +69,13 @@ using namespace llvm;
static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
#endif
+// The latency of mtctr is only justified if there are more than 4
+// comparisons that will be removed as a result.
+static cl::opt<unsigned>
+SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+ cl::desc("Loops with a constant trip count smaller than "
+ "this value will not use the count register."));
+
STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
namespace llvm {
@@ -95,6 +107,8 @@ namespace {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
private:
@@ -107,10 +121,12 @@ namespace {
const PPCTargetLowering *TLI;
const DataLayout *DL;
const TargetLibraryInfo *LibInfo;
+ const TargetTransformInfo *TTI;
LoopInfo *LI;
ScalarEvolution *SE;
DominatorTree *DT;
bool PreserveLCSSA;
+ TargetSchedModel SchedModel;
};
char PPCCTRLoops::ID = 0;
@@ -179,6 +195,7 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DL = &F.getParent()->getDataLayout();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
@@ -243,8 +260,8 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
if (CallInst *CI = dyn_cast<CallInst>(J)) {
// Inline ASM is okay, unless it clobbers the ctr register.
if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
- if (asmClobbersCTR(IA))
- return true;
+ if (asmClobbersCTR(IA))
+ return true;
continue;
}
@@ -462,10 +479,24 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
return false;
}
-
bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
bool MadeChange = false;
+ // Do not convert small, short loops to CTR loops.
+ unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
+ if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+ SmallPtrSet<const Value *, 32> EphValues;
+ auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+ CodeMetrics Metrics;
+ for (BasicBlock *BB : L->blocks())
+ Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+ // 6 is an approximate latency for the mtctr instruction.
+ if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+ return false;
+ }
+
// Process nested loops first.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
MadeChange |= convertToCTRLoop(*I);
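To make the new threshold concrete, here is an editorial sketch (not part of the patch): the issue width and instruction count below are assumed values, while the threshold of 4 matches the default of -min-ctr-loop-threshold added above. With an issue width of 4, a loop whose constant trip count is below 4 and whose body has at most 6 * 4 = 24 instructions is left alone, since the mtctr latency would not pay for itself.

#include <cstdio>

int main() {
  // Assumed values for illustration only.
  unsigned SmallCTRLoopThreshold = 4;  // default of -min-ctr-loop-threshold
  unsigned IssueWidth = 4;             // assumed processor issue width
  unsigned ConstTripCount = 3;         // known constant trip count of the loop
  unsigned NumInsts = 20;              // instructions in the loop body

  // Mirrors the early exit added to convertToCTRLoop above.
  bool SkipConversion = ConstTripCount &&
                        ConstTripCount < SmallCTRLoopThreshold &&
                        NumInsts <= 6 * IssueWidth;
  printf("convert to CTR loop: %s\n", SkipConversion ? "no" : "yes");
  return 0;
}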
@@ -659,12 +690,11 @@ check_block:
}
if (I != BI && clobbersCTR(*I)) {
- DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" <<
- MBB->getFullName() << ") instruction " << *I <<
- " clobbers CTR, invalidating " << "BB#" <<
- BI->getParent()->getNumber() << " (" <<
- BI->getParent()->getFullName() << ") instruction " <<
- *BI << "\n");
+ DEBUG(dbgs() << printMBBReference(*MBB) << " (" << MBB->getFullName()
+ << ") instruction " << *I << " clobbers CTR, invalidating "
+ << printMBBReference(*BI->getParent()) << " ("
+ << BI->getParent()->getFullName() << ") instruction " << *BI
+ << "\n");
return false;
}
@@ -678,10 +708,10 @@ check_block:
if (CheckPreds) {
queue_preds:
if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) {
- DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" <<
- BI->getParent()->getNumber() << " (" <<
- BI->getParent()->getFullName() << ") instruction " <<
- *BI << "\n");
+ DEBUG(dbgs() << "Unable to find a MTCTR instruction for "
+ << printMBBReference(*BI->getParent()) << " ("
+ << BI->getParent()->getFullName() << ") instruction " << *BI
+ << "\n");
return false;
}
diff --git a/lib/Target/PowerPC/PPCEarlyReturn.cpp b/lib/Target/PowerPC/PPCEarlyReturn.cpp
index 811e4dd9dfe16..1699463c0a4bc 100644
--- a/lib/Target/PowerPC/PPCEarlyReturn.cpp
+++ b/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -173,7 +173,7 @@ protected:
public:
bool runOnMachineFunction(MachineFunction &MF) override {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
TII = MF.getSubtarget().getInstrInfo();
diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp
index 41e3190c3eec7..b00e98b63e346 100644
--- a/lib/Target/PowerPC/PPCExpandISEL.cpp
+++ b/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -59,6 +59,8 @@ class PPCExpandISEL : public MachineFunctionPass {
typedef SmallDenseMap<int, BlockISELList> ISELInstructionList;
// A map of MBB numbers to their lists of contained ISEL instructions.
+ // Please note that when we traverse this list and expand ISEL, we only
+ // remove the ISEL from the MBB, not from this list.
ISELInstructionList ISELInstructions;
/// Initialize the object.
@@ -124,9 +126,6 @@ public:
#endif
bool runOnMachineFunction(MachineFunction &MF) override {
- if (!isExpandISELEnabled(MF))
- return false;
-
DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
initialize(MF);
@@ -171,7 +170,7 @@ bool PPCExpandISEL::collectISELInstructions() {
#ifndef NDEBUG
void PPCExpandISEL::DumpISELInstructions() const {
for (const auto &I : ISELInstructions) {
- DEBUG(dbgs() << "BB#" << I.first << ":\n");
+ DEBUG(dbgs() << printMBBReference(*MF->getBlockNumbered(I.first)) << ":\n");
for (const auto &VI : I.second)
DEBUG(dbgs() << " "; VI->print(dbgs()));
}
@@ -190,26 +189,71 @@ bool PPCExpandISEL::canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI) {
}
void PPCExpandISEL::expandAndMergeISELs() {
+ bool ExpandISELEnabled = isExpandISELEnabled(*MF);
+
for (auto &BlockList : ISELInstructions) {
- DEBUG(dbgs() << "Expanding ISEL instructions in BB#" << BlockList.first
+ DEBUG(dbgs() << "Expanding ISEL instructions in "
+ << printMBBReference(*MF->getBlockNumbered(BlockList.first))
<< "\n");
-
BlockISELList &CurrentISELList = BlockList.second;
auto I = CurrentISELList.begin();
auto E = CurrentISELList.end();
while (I != E) {
- BlockISELList SubISELList;
-
- SubISELList.push_back(*I++);
+ assert(isISEL(**I) && "Expecting an ISEL instruction");
+ MachineOperand &Dest = (*I)->getOperand(0);
+ MachineOperand &TrueValue = (*I)->getOperand(1);
+ MachineOperand &FalseValue = (*I)->getOperand(2);
- // Collect the ISELs that can be merged together.
- while (I != E && canMerge(SubISELList.back(), *I))
+ // Special case 1, all registers used by ISEL are the same one.
+ // The non-redundant isel 0, 0, 0, N would not satisfy these conditions
+ // as it would be ISEL %R0, %ZERO, %R0, %CRN.
+ if (useSameRegister(Dest, TrueValue) &&
+ useSameRegister(Dest, FalseValue)) {
+ DEBUG(dbgs() << "Remove redundant ISEL instruction: " << **I << "\n");
+ // FIXME: if the CR field used has no other uses, we could eliminate the
+ // instruction that defines it. This would have to be done manually
+ // since this pass runs too late to run DCE after it.
+ NumRemoved++;
+ (*I)->eraseFromParent();
+ I++;
+ } else if (useSameRegister(TrueValue, FalseValue)) {
+ // Special case 2, the two input registers used by ISEL are the same.
+ // Note: the non-foldable isel RX, 0, 0, N would not satisfy this
+ // condition as it would be ISEL %RX, %ZERO, %R0, %CRN, which makes it
+ // safe to fold ISEL to MR(OR) instead of ADDI.
+ MachineBasicBlock *MBB = (*I)->getParent();
+ DEBUG(dbgs() << "Fold the ISEL instruction to an unconditional copy:\n");
+ DEBUG(dbgs() << "ISEL: " << **I << "\n");
+ NumFolded++;
+ // Note: we're using both the TrueValue and FalseValue operands so as
+ // not to lose the kill flag if it is set on either of them.
+ BuildMI(*MBB, (*I), dl, TII->get(isISEL8(**I) ? PPC::OR8 : PPC::OR))
+ .add(Dest)
+ .add(TrueValue)
+ .add(FalseValue);
+ (*I)->eraseFromParent();
+ I++;
+ } else if (ExpandISELEnabled) { // Normal cases expansion enabled
+ DEBUG(dbgs() << "Expand ISEL instructions:\n");
+ DEBUG(dbgs() << "ISEL: " << **I << "\n");
+ BlockISELList SubISELList;
SubISELList.push_back(*I++);
+ // Collect the ISELs that can be merged together.
+ // This will eat up ISEL instructions without considering whether they
+ // may be redundant or foldable to a register copy. So we still keep
+ // the handleSpecialCases() downstream to handle them.
+ while (I != E && canMerge(SubISELList.back(), *I)) {
+ DEBUG(dbgs() << "ISEL: " << **I << "\n");
+ SubISELList.push_back(*I++);
+ }
- expandMergeableISELs(SubISELList);
- }
- }
+ expandMergeableISELs(SubISELList);
+ } else { // Normal cases expansion disabled
+ I++; // leave the ISEL as it is
+ }
+ } // end while
+ } // end for
}
void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
@@ -232,13 +276,15 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
// Similarly, if at least one of the ISEL instructions satisfy the
// following condition, we need the False Block:
// The Dest Register and False Value Register are not the same.
-
bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue);
bool IsORIInstRequired = !useSameRegister(Dest, FalseValue);
// Special case 1, all registers used by ISEL are the same one.
if (!IsADDIInstRequired && !IsORIInstRequired) {
DEBUG(dbgs() << "Remove redundant ISEL instruction.");
+ // FIXME: if the CR field used has no other uses, we could eliminate the
+ // instruction that defines it. This would have to be done manually
+ // since this pass runs too late to run DCE after it.
NumRemoved++;
(*MI)->eraseFromParent();
// Setting MI to the erase result keeps the iterator valid and increased.
@@ -253,14 +299,15 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
// PPC::ZERO8 will be used for the first operand if the value is meant to
// be zero. In this case, the useSameRegister method will return false,
// thereby preventing this ISEL from being folded.
-
if (useSameRegister(TrueValue, FalseValue) && (BIL.size() == 1)) {
DEBUG(dbgs() << "Fold the ISEL instruction to an unconditional copy.");
NumFolded++;
- BuildMI(*MBB, (*MI), dl, TII->get(isISEL8(**MI) ? PPC::ADDI8 : PPC::ADDI))
+ // Note: we're using both the TrueValue and FalseValue operands so as
+ // not to lose the kill flag if it is set on either of them.
+ BuildMI(*MBB, (*MI), dl, TII->get(isISEL8(**MI) ? PPC::OR8 : PPC::OR))
.add(Dest)
.add(TrueValue)
- .add(MachineOperand::CreateImm(0));
+ .add(FalseValue);
(*MI)->eraseFromParent();
// Setting MI to the erase result keeps the iterator valid and increased.
MI = BIL.erase(MI);
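The switch from ADDI to OR above relies on OR-ing a register with itself being an identity (this is the classic MR(OR) copy mentioned in the comments), while still letting both source operands carry their kill flags. A trivial editorial sketch of that identity, not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t V = 0x123456789abcdef0ULL;  // arbitrary register value
  assert((V | V) == V);                // OR rD, rS, rS copies rS into rD
  return 0;
}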
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index bc9957194f6dd..402e29cdff726 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
@@ -36,7 +37,6 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
//===----------------------------------------------------------------------===//
@@ -1930,7 +1930,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
PPCFuncInfo->setUsesTOCBasePtr();
// For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
- if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) {
+ if (CModel == CodeModel::Small) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT),
TmpReg)
.addConstantPoolIndex(Idx).addReg(PPC::X2);
@@ -1981,7 +1981,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
PPCFuncInfo->setUsesTOCBasePtr();
// For small code model, generate a simple TOC load.
- if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
+ if (CModel == CodeModel::Small)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc),
DestReg)
.addGlobalAddress(GV)
@@ -1991,9 +1991,9 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
// or externally available linkage, a non-local function address, or a
// jump table address (not yet needed), or if we are generating code
// for large code model, we generate:
- // LDtocL(GV, ADDIStocHA(%X2, GV))
+ // LDtocL(GV, ADDIStocHA(%x2, GV))
// Otherwise we generate:
- // ADDItocL(ADDIStocHA(%X2, GV), GV)
+ // ADDItocL(ADDIStocHA(%x2, GV), GV)
// Either way, start with the ADDIStocHA:
unsigned HighPartReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index b49c3345a17dd..c870a2256691e 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -312,11 +312,9 @@ static void HandleVRSaveUpdate(MachineInstr &MI, const TargetInstrInfo &TII) {
// Live in and live out values already must be in the mask, so don't bother
// marking them.
- for (MachineRegisterInfo::livein_iterator
- I = MF->getRegInfo().livein_begin(),
- E = MF->getRegInfo().livein_end(); I != E; ++I) {
- unsigned RegNo = TRI->getEncodingValue(I->first);
- if (VRRegNo[RegNo] == I->first) // If this really is a vector reg.
+ for (std::pair<unsigned, unsigned> LI : MF->getRegInfo().liveins()) {
+ unsigned RegNo = TRI->getEncodingValue(LI.first);
+ if (VRRegNo[RegNo] == LI.first) // If this really is a vector reg.
UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked.
}
@@ -436,7 +434,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned LR = RegInfo->getRARegister();
- bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+ bool DisableRedZone = MF.getFunction().hasFnAttribute(Attribute::NoRedZone);
bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
!MustSaveLR(MF, LR) && // No need to save LR.
@@ -501,7 +499,7 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
// Naked functions have no stack frame pushed, so we don't have a frame
// pointer.
- if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+ if (MF.getFunction().hasFnAttribute(Attribute::Naked))
return false;
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
@@ -694,7 +692,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
DebugLoc dl;
bool needsCFI = MMI.hasDebugInfo() ||
- MF.getFunction()->needsUnwindTableEntry();
+ MF.getFunction().needsUnwindTableEntry();
// Get processor type.
bool isPPC64 = Subtarget.isPPC64();
@@ -1507,7 +1505,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
unsigned RetOpcode = MBBI->getOpcode();
if (MF.getTarget().Options.GuaranteedTailCallOpt &&
(RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) &&
- MF.getFunction()->getCallingConv() == CallingConv::Fast) {
+ MF.getFunction().getCallingConv() == CallingConv::Fast) {
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
unsigned CallerAllocatedAmt = FI->getMinReservedArea();
@@ -2067,7 +2065,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
bool
PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
// Currently, this function only handles SVR4 32- and 64-bit ABIs.
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index 28b0c57f0ffb5..f845d5a9ac64a 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -15,7 +15,7 @@
#include "PPC.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -106,7 +106,7 @@ public:
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const override;
/// targetHandlesStackFrameRounding - Returns true if the target is
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 901539b682baa..d3a223fe03e0f 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -36,6 +36,8 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DebugLoc.h"
@@ -53,8 +55,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -101,6 +101,29 @@ static cl::opt<bool> EnableBranchHint(
cl::desc("Enable static hinting of branches on ppc"),
cl::Hidden);
+enum ICmpInGPRType { ICGPR_All, ICGPR_None, ICGPR_I32, ICGPR_I64,
+ ICGPR_NonExtIn, ICGPR_Zext, ICGPR_Sext, ICGPR_ZextI32,
+ ICGPR_SextI32, ICGPR_ZextI64, ICGPR_SextI64 };
+
+static cl::opt<ICmpInGPRType> CmpInGPR(
+ "ppc-gpr-icmps", cl::Hidden, cl::init(ICGPR_All),
+ cl::desc("Specify the types of comparisons to emit GPR-only code for."),
+ cl::values(clEnumValN(ICGPR_None, "none", "Do not modify integer comparisons."),
+ clEnumValN(ICGPR_All, "all", "All possible int comparisons in GPRs."),
+ clEnumValN(ICGPR_I32, "i32", "Only i32 comparisons in GPRs."),
+ clEnumValN(ICGPR_I64, "i64", "Only i64 comparisons in GPRs."),
+ clEnumValN(ICGPR_NonExtIn, "nonextin",
+ "Only comparisons where inputs don't need [sz]ext."),
+ clEnumValN(ICGPR_Zext, "zext", "Only comparisons with zext result."),
+ clEnumValN(ICGPR_ZextI32, "zexti32",
+ "Only i32 comparisons with zext result."),
+ clEnumValN(ICGPR_ZextI64, "zexti64",
+ "Only i64 comparisons with zext result."),
+ clEnumValN(ICGPR_Sext, "sext", "Only comparisons with sext result."),
+ clEnumValN(ICGPR_SextI32, "sexti32",
+ "Only i32 comparisons with sext result."),
+ clEnumValN(ICGPR_SextI64, "sexti64",
+ "Only i64 comparisons with sext result.")));
namespace {
//===--------------------------------------------------------------------===//
@@ -133,6 +156,12 @@ namespace {
void PreprocessISelDAG() override;
void PostprocessISelDAG() override;
+ /// getI16Imm - Return a target constant with the specified value, of type
+ /// i16.
+ inline SDValue getI16Imm(unsigned Imm, const SDLoc &dl) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i16);
+ }
+
/// getI32Imm - Return a target constant with the specified value, of type
/// i32.
inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
@@ -168,6 +197,7 @@ namespace {
bool tryBitfieldInsert(SDNode *N);
bool tryBitPermutation(SDNode *N);
+ bool tryIntCompareInGPR(SDNode *N);
/// SelectCC - Select a comparison of the specified values with the
/// specified condition code, returning the CR# of the expression.
@@ -270,34 +300,7 @@ namespace {
#include "PPCGenDAGISel.inc"
private:
- // Conversion type for interpreting results of a 32-bit instruction as
- // a 64-bit value or vice versa.
- enum ExtOrTruncConversion { Ext, Trunc };
-
- // Modifiers to guide how an ISD::SETCC node's result is to be computed
- // in a GPR.
- // ZExtOrig - use the original condition code, zero-extend value
- // ZExtInvert - invert the condition code, zero-extend value
- // SExtOrig - use the original condition code, sign-extend value
- // SExtInvert - invert the condition code, sign-extend value
- enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert };
-
bool trySETCC(SDNode *N);
- bool tryEXTEND(SDNode *N);
- bool tryLogicOpOfCompares(SDNode *N);
- SDValue computeLogicOpInGPR(SDValue LogicOp);
- SDValue signExtendInputIfNeeded(SDValue Input);
- SDValue zeroExtendInputIfNeeded(SDValue Input);
- SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv);
- SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- int64_t RHSValue, SDLoc dl);
- SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- int64_t RHSValue, SDLoc dl);
- SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- int64_t RHSValue, SDLoc dl);
- SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- int64_t RHSValue, SDLoc dl);
- SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts);
void PeepholePPC64();
void PeepholePPC64ZExt();
@@ -388,7 +391,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
// Insert the set of GlobalBaseReg into the first MBB of the function
MachineBasicBlock &FirstMBB = MF->front();
MachineBasicBlock::iterator MBBI = FirstMBB.begin();
- const Module *M = MF->getFunction()->getParent();
+ const Module *M = MF->getFunction().getParent();
DebugLoc dl;
if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) {
@@ -450,6 +453,12 @@ static bool isInt32Immediate(SDValue N, unsigned &Imm) {
return isInt32Immediate(N.getNode(), Imm);
}
+/// isInt64Immediate - This method tests to see if the value is a 64-bit
+/// constant operand. If so, Imm will receive the 64-bit value.
+static bool isInt64Immediate(SDValue N, uint64_t &Imm) {
+ return isInt64Immediate(N.getNode(), Imm);
+}
+
static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo,
const SDValue &DestMBB) {
assert(isa<BasicBlockSDNode>(DestMBB));
@@ -607,8 +616,6 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
unsigned MB, ME;
if (isRunOfOnes(InsertMask, MB, ME)) {
- SDValue Tmp1, Tmp2;
-
if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
isInt32Immediate(Op1.getOperand(1), Value)) {
Op1 = Op1.getOperand(0);
@@ -643,8 +650,8 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
}
// Predict the number of instructions that would be generated by calling
-// getInt64(N).
-static unsigned getInt64CountDirect(int64_t Imm) {
+// selectI64Imm(N).
+static unsigned selectI64ImmInstrCountDirect(int64_t Imm) {
// Assume no remaining bits.
unsigned Remainder = 0;
// Assume no shift required.
@@ -712,8 +719,8 @@ static uint64_t Rot64(uint64_t Imm, unsigned R) {
return (Imm << R) | (Imm >> (64 - R));
}
-static unsigned getInt64Count(int64_t Imm) {
- unsigned Count = getInt64CountDirect(Imm);
+static unsigned selectI64ImmInstrCount(int64_t Imm) {
+ unsigned Count = selectI64ImmInstrCountDirect(Imm);
// If the instruction count is 1 or 2, we do not need further analysis
// since rotate + load constant requires at least 2 instructions.
@@ -722,10 +729,10 @@ static unsigned getInt64Count(int64_t Imm) {
for (unsigned r = 1; r < 63; ++r) {
uint64_t RImm = Rot64(Imm, r);
- unsigned RCount = getInt64CountDirect(RImm) + 1;
+ unsigned RCount = selectI64ImmInstrCountDirect(RImm) + 1;
Count = std::min(Count, RCount);
- // See comments in getInt64 for an explanation of the logic below.
+ // See comments in selectI64Imm for an explanation of the logic below.
unsigned LS = findLastSet(RImm);
if (LS != r-1)
continue;
@@ -733,17 +740,17 @@ static unsigned getInt64Count(int64_t Imm) {
uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
uint64_t RImmWithOnes = RImm | OnesMask;
- RCount = getInt64CountDirect(RImmWithOnes) + 1;
+ RCount = selectI64ImmInstrCountDirect(RImmWithOnes) + 1;
Count = std::min(Count, RCount);
}
return Count;
}
-// Select a 64-bit constant. For cost-modeling purposes, getInt64Count
+// Select a 64-bit constant. For cost-modeling purposes, selectI64ImmInstrCount
// (above) needs to be kept in sync with this function.
-static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl,
- int64_t Imm) {
+static SDNode *selectI64ImmDirect(SelectionDAG *CurDAG, const SDLoc &dl,
+ int64_t Imm) {
// Assume no remaining bits.
unsigned Remainder = 0;
// Assume no shift required.
@@ -779,8 +786,10 @@ static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl,
// Simple value.
if (isInt<16>(Imm)) {
+ uint64_t SextImm = SignExtend64(Lo, 16);
+ SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64);
// Just the Lo bits.
- Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
+ Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, SDImm);
} else if (Lo) {
// Handle the Hi bits.
unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
@@ -825,13 +834,14 @@ static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl,
return Result;
}
-static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
- unsigned Count = getInt64CountDirect(Imm);
+static SDNode *selectI64Imm(SelectionDAG *CurDAG, const SDLoc &dl,
+ int64_t Imm) {
+ unsigned Count = selectI64ImmInstrCountDirect(Imm);
// If the instruction count is 1 or 2, we do not need further analysis
// since rotate + load constant requires at least 2 instructions.
if (Count <= 2)
- return getInt64Direct(CurDAG, dl, Imm);
+ return selectI64ImmDirect(CurDAG, dl, Imm);
unsigned RMin = 0;
@@ -840,7 +850,7 @@ static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
for (unsigned r = 1; r < 63; ++r) {
uint64_t RImm = Rot64(Imm, r);
- unsigned RCount = getInt64CountDirect(RImm) + 1;
+ unsigned RCount = selectI64ImmInstrCountDirect(RImm) + 1;
if (RCount < Count) {
Count = RCount;
RMin = r;
@@ -863,7 +873,7 @@ static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
uint64_t RImmWithOnes = RImm | OnesMask;
- RCount = getInt64CountDirect(RImmWithOnes) + 1;
+ RCount = selectI64ImmInstrCountDirect(RImmWithOnes) + 1;
if (RCount < Count) {
Count = RCount;
RMin = r;
@@ -873,24 +883,86 @@ static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
}
if (!RMin)
- return getInt64Direct(CurDAG, dl, Imm);
+ return selectI64ImmDirect(CurDAG, dl, Imm);
auto getI32Imm = [CurDAG, dl](unsigned Imm) {
return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
};
- SDValue Val = SDValue(getInt64Direct(CurDAG, dl, MatImm), 0);
+ SDValue Val = SDValue(selectI64ImmDirect(CurDAG, dl, MatImm), 0);
return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val,
getI32Imm(64 - RMin), getI32Imm(MaskEnd));
}
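+// Illustrative example of the rotation trick above (values chosen purely for
+// demonstration): Imm = 0x8000000000000001 needs a long direct sequence, but
+// rotating it left by one gives 0x3, which is a single LI8; the trailing
+// RLDICR rotates the result back, so the constant costs only two instructions.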
+static unsigned allUsesTruncate(SelectionDAG *CurDAG, SDNode *N) {
+ unsigned MaxTruncation = 0;
+ // Cannot use range-based for loop here as we need the actual use (i.e. we
+ // need the operand number corresponding to the use). A range-based for
+ // will unbox the use and provide an SDNode*.
+ for (SDNode::use_iterator Use = N->use_begin(), UseEnd = N->use_end();
+ Use != UseEnd; ++Use) {
+ unsigned Opc =
+ Use->isMachineOpcode() ? Use->getMachineOpcode() : Use->getOpcode();
+ switch (Opc) {
+ default: return 0;
+ case ISD::TRUNCATE:
+ if (Use->isMachineOpcode())
+ return 0;
+ MaxTruncation =
+ std::max(MaxTruncation, Use->getValueType(0).getSizeInBits());
+ continue;
+ case ISD::STORE: {
+ if (Use->isMachineOpcode())
+ return 0;
+ StoreSDNode *STN = cast<StoreSDNode>(*Use);
+ unsigned MemVTSize = STN->getMemoryVT().getSizeInBits();
+ if (MemVTSize == 64 || Use.getOperandNo() != 0)
+ return 0;
+ MaxTruncation = std::max(MaxTruncation, MemVTSize);
+ continue;
+ }
+ case PPC::STW8:
+ case PPC::STWX8:
+ case PPC::STWU8:
+ case PPC::STWUX8:
+ if (Use.getOperandNo() != 0)
+ return 0;
+ MaxTruncation = std::max(MaxTruncation, 32u);
+ continue;
+ case PPC::STH8:
+ case PPC::STHX8:
+ case PPC::STHU8:
+ case PPC::STHUX8:
+ if (Use.getOperandNo() != 0)
+ return 0;
+ MaxTruncation = std::max(MaxTruncation, 16u);
+ continue;
+ case PPC::STB8:
+ case PPC::STBX8:
+ case PPC::STBU8:
+ case PPC::STBUX8:
+ if (Use.getOperandNo() != 0)
+ return 0;
+ MaxTruncation = std::max(MaxTruncation, 8u);
+ continue;
+ }
+ }
+ return MaxTruncation;
+}
+
// Select a 64-bit constant.
-static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) {
+static SDNode *selectI64Imm(SelectionDAG *CurDAG, SDNode *N) {
SDLoc dl(N);
// Get 64 bit value.
int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
- return getInt64(CurDAG, dl, Imm);
+ if (unsigned MinSize = allUsesTruncate(CurDAG, N)) {
+ uint64_t SextImm = SignExtend64(Imm, MinSize);
+ SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64);
+ if (isInt<16>(SextImm))
+ return CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, SDImm);
+ }
+ return selectI64Imm(CurDAG, dl, Imm);
}
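+// Illustrative example of the truncating-uses shortcut above: for an i64
+// constant 0xFFFFFFFE whose only uses are 32-bit (or narrower) stores,
+// SignExtend64(Imm, 32) yields -2, which fits in 16 bits, so a single LI8 of
+// -2 suffices instead of a longer materialization sequence; only the low
+// 32 bits are ever observed by the users.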
namespace {
@@ -1090,6 +1162,25 @@ class BitPermutationSelector {
return std::make_pair(Interesting = true, &Bits);
}
+ case ISD::ZERO_EXTEND: {
+ // We support only the case with zero extension from i32 to i64 so far.
+ if (V.getValueType() != MVT::i64 ||
+ V.getOperand(0).getValueType() != MVT::i32)
+ break;
+
+ const SmallVector<ValueBit, 64> *LHSBits;
+ const unsigned NumOperandBits = 32;
+ std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0),
+ NumOperandBits);
+
+ for (unsigned i = 0; i < NumOperandBits; ++i)
+ Bits[i] = (*LHSBits)[i];
+
+ for (unsigned i = NumOperandBits; i < NumBits; ++i)
+ Bits[i] = ValueBit(ValueBit::ConstZero);
+
+ return std::make_pair(Interesting, &Bits);
+ }
}
for (unsigned i = 0; i < NumBits; ++i)
@@ -1351,6 +1442,24 @@ class BitPermutationSelector {
return ~Mask;
}
+  // This method extends an input value to 64 bits if the input is a 32-bit
+  // integer. While selecting instructions in BitPermutationSelector in 64-bit
+  // mode, an input value can be a 32-bit integer if a ZERO_EXTEND node is
+  // included. In such a case, we extend it to 64 bits to be consistent with
+  // the other values.
+ SDValue ExtendToInt64(SDValue V, const SDLoc &dl) {
+ if (V.getValueSizeInBits() == 64)
+ return V;
+
+ assert(V.getValueSizeInBits() == 32);
+ SDValue SubRegIdx = CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ SDValue ImDef = SDValue(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl,
+ MVT::i64), 0);
+ SDValue ExtVal = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl,
+ MVT::i64, ImDef, V,
+ SubRegIdx), 0);
+ return ExtVal;
+ }
+
// Depending on the number of groups for a particular value, it might be
// better to rotate, mask explicitly (using andi/andis), and then or the
// result. Select this part of the result first.
@@ -1567,27 +1676,30 @@ class BitPermutationSelector {
assert(InstMaskStart >= 32 && "Mask cannot start out of range");
assert(InstMaskEnd >= 32 && "Mask cannot end out of range");
SDValue Ops[] =
- { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl),
- getI32Imm(InstMaskEnd - 32, dl) };
+ { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl),
+ getI32Imm(InstMaskStart - 32, dl), getI32Imm(InstMaskEnd - 32, dl) };
return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64,
Ops), 0);
}
if (InstMaskEnd == 63) {
SDValue Ops[] =
- { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) };
+ { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl),
+ getI32Imm(InstMaskStart, dl) };
return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0);
}
if (InstMaskStart == 0) {
SDValue Ops[] =
- { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskEnd, dl) };
+ { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl),
+ getI32Imm(InstMaskEnd, dl) };
return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0);
}
if (InstMaskEnd == 63 - RLAmt) {
SDValue Ops[] =
- { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) };
+ { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl),
+ getI32Imm(InstMaskStart, dl) };
return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0);
}
@@ -1628,15 +1740,16 @@ class BitPermutationSelector {
assert(InstMaskStart >= 32 && "Mask cannot start out of range");
assert(InstMaskEnd >= 32 && "Mask cannot end out of range");
SDValue Ops[] =
- { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl),
- getI32Imm(InstMaskEnd - 32, dl) };
+ { ExtendToInt64(Base, dl), ExtendToInt64(V, dl), getI32Imm(RLAmt, dl),
+ getI32Imm(InstMaskStart - 32, dl), getI32Imm(InstMaskEnd - 32, dl) };
return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64,
Ops), 0);
}
if (InstMaskEnd == 63 - RLAmt) {
SDValue Ops[] =
- { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) };
+ { ExtendToInt64(Base, dl), ExtendToInt64(V, dl), getI32Imm(RLAmt, dl),
+ getI32Imm(InstMaskStart, dl) };
return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0);
}
@@ -1730,7 +1843,7 @@ class BitPermutationSelector {
NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) +
(unsigned) (ANDIMask != 0 && ANDISMask != 0);
else
- NumAndInsts += getInt64Count(Mask) + /* and */ 1;
+ NumAndInsts += selectI64ImmInstrCount(Mask) + /* and */ 1;
unsigned NumRLInsts = 0;
bool FirstBG = true;
@@ -1786,10 +1899,14 @@ class BitPermutationSelector {
SDValue ANDIVal, ANDISVal;
if (ANDIMask != 0)
ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
- VRot, getI32Imm(ANDIMask, dl)), 0);
+ ExtendToInt64(VRot, dl),
+ getI32Imm(ANDIMask, dl)),
+ 0);
if (ANDISMask != 0)
ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
- VRot, getI32Imm(ANDISMask, dl)), 0);
+ ExtendToInt64(VRot, dl),
+ getI32Imm(ANDISMask, dl)),
+ 0);
if (!ANDIVal)
TotalVal = ANDISVal;
@@ -1797,19 +1914,21 @@ class BitPermutationSelector {
TotalVal = ANDIVal;
else
TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
- ANDIVal, ANDISVal), 0);
+ ExtendToInt64(ANDIVal, dl), ANDISVal), 0);
} else {
- TotalVal = SDValue(getInt64(CurDAG, dl, Mask), 0);
+ TotalVal = SDValue(selectI64Imm(CurDAG, dl, Mask), 0);
TotalVal =
SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
- VRot, TotalVal), 0);
+ ExtendToInt64(VRot, dl), TotalVal),
+ 0);
}
if (!Res)
Res = TotalVal;
else
Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
- Res, TotalVal), 0);
+ ExtendToInt64(Res, dl), TotalVal),
+ 0);
// Now, remove all groups with this underlying value and rotation
// factor.
@@ -1929,10 +2048,10 @@ class BitPermutationSelector {
SDValue ANDIVal, ANDISVal;
if (ANDIMask != 0)
ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
- Res, getI32Imm(ANDIMask, dl)), 0);
+ ExtendToInt64(Res, dl), getI32Imm(ANDIMask, dl)), 0);
if (ANDISMask != 0)
ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
- Res, getI32Imm(ANDISMask, dl)), 0);
+ ExtendToInt64(Res, dl), getI32Imm(ANDISMask, dl)), 0);
if (!ANDIVal)
Res = ANDISVal;
@@ -1940,14 +2059,14 @@ class BitPermutationSelector {
Res = ANDIVal;
else
Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
- ANDIVal, ANDISVal), 0);
+ ExtendToInt64(ANDIVal, dl), ANDISVal), 0);
} else {
- if (InstCnt) *InstCnt += getInt64Count(Mask) + /* and */ 1;
+ if (InstCnt) *InstCnt += selectI64ImmInstrCount(Mask) + /* and */ 1;
- SDValue MaskVal = SDValue(getInt64(CurDAG, dl, Mask), 0);
+ SDValue MaskVal = SDValue(selectI64Imm(CurDAG, dl, Mask), 0);
Res =
SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
- Res, MaskVal), 0);
+ ExtendToInt64(Res, dl), MaskVal), 0);
}
}
@@ -2046,8 +2165,1204 @@ public:
}
};
+class IntegerCompareEliminator {
+ SelectionDAG *CurDAG;
+ PPCDAGToDAGISel *S;
+ // Conversion type for interpreting results of a 32-bit instruction as
+ // a 64-bit value or vice versa.
+ enum ExtOrTruncConversion { Ext, Trunc };
+
+ // Modifiers to guide how an ISD::SETCC node's result is to be computed
+ // in a GPR.
+ // ZExtOrig - use the original condition code, zero-extend value
+ // ZExtInvert - invert the condition code, zero-extend value
+ // SExtOrig - use the original condition code, sign-extend value
+ // SExtInvert - invert the condition code, sign-extend value
+ enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert };
+
+ // Comparisons against zero to emit GPR code sequences for. Each of these
+ // sequences may need to be emitted for two or more equivalent patterns.
+ // For example (a >= 0) == (a > -1). The direction of the comparison (</>)
+ // matters as well as the extension type: sext (-1/0), zext (1/0).
+ // GEZExt - (zext (LHS >= 0))
+ // GESExt - (sext (LHS >= 0))
+ // LEZExt - (zext (LHS <= 0))
+ // LESExt - (sext (LHS <= 0))
+ enum ZeroCompare { GEZExt, GESExt, LEZExt, LESExt };
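+  // For example, with LHS = -5: GEZExt and GESExt both yield 0 (-5 >= 0 is
+  // false), while LEZExt yields 1 and LESExt yields -1 (-5 <= 0 is true).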
+
+ SDNode *tryEXTEND(SDNode *N);
+ SDNode *tryLogicOpOfCompares(SDNode *N);
+ SDValue computeLogicOpInGPR(SDValue LogicOp);
+ SDValue signExtendInputIfNeeded(SDValue Input);
+ SDValue zeroExtendInputIfNeeded(SDValue Input);
+ SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv);
+ SDValue getCompoundZeroComparisonInGPR(SDValue LHS, SDLoc dl,
+ ZeroCompare CmpTy);
+ SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts);
+
+public:
+ IntegerCompareEliminator(SelectionDAG *DAG,
+ PPCDAGToDAGISel *Sel) : CurDAG(DAG), S(Sel) {
+ assert(CurDAG->getTargetLoweringInfo()
+ .getPointerTy(CurDAG->getDataLayout()).getSizeInBits() == 64 &&
+ "Only expecting to use this on 64 bit targets.");
+ }
+ SDNode *Select(SDNode *N) {
+ if (CmpInGPR == ICGPR_None)
+ return nullptr;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ZERO_EXTEND:
+ if (CmpInGPR == ICGPR_Sext || CmpInGPR == ICGPR_SextI32 ||
+ CmpInGPR == ICGPR_SextI64)
+ return nullptr;
+ LLVM_FALLTHROUGH;
+ case ISD::SIGN_EXTEND:
+ if (CmpInGPR == ICGPR_Zext || CmpInGPR == ICGPR_ZextI32 ||
+ CmpInGPR == ICGPR_ZextI64)
+ return nullptr;
+ return tryEXTEND(N);
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ return tryLogicOpOfCompares(N);
+ }
+ return nullptr;
+ }
+};
+
+static bool isLogicOp(unsigned Opc) {
+ return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR;
+}
+// The obvious case for wanting to keep the value in a GPR. Namely, the
+// result of the comparison is actually needed in a GPR.
+SDNode *IntegerCompareEliminator::tryEXTEND(SDNode *N) {
+ assert((N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::SIGN_EXTEND) &&
+ "Expecting a zero/sign extend node!");
+ SDValue WideRes;
+ // If we are zero-extending the result of a logical operation on i1
+ // values, we can keep the values in GPRs.
+ if (isLogicOp(N->getOperand(0).getOpcode()) &&
+ N->getOperand(0).getValueType() == MVT::i1 &&
+ N->getOpcode() == ISD::ZERO_EXTEND)
+ WideRes = computeLogicOpInGPR(N->getOperand(0));
+ else if (N->getOperand(0).getOpcode() != ISD::SETCC)
+ return nullptr;
+ else
+ WideRes =
+ getSETCCInGPR(N->getOperand(0),
+ N->getOpcode() == ISD::SIGN_EXTEND ?
+ SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig);
+
+ if (!WideRes)
+ return nullptr;
+
+ SDLoc dl(N);
+ bool Input32Bit = WideRes.getValueType() == MVT::i32;
+ bool Output32Bit = N->getValueType(0) == MVT::i32;
+
+ NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0;
+ NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1;
+
+ SDValue ConvOp = WideRes;
+ if (Input32Bit != Output32Bit)
+ ConvOp = addExtOrTrunc(WideRes, Input32Bit ? ExtOrTruncConversion::Ext :
+ ExtOrTruncConversion::Trunc);
+ return ConvOp.getNode();
+}
+
+// Attempt to perform logical operations on the results of comparisons while
+// keeping the values in GPRs. Without doing so, these would end up being
+// lowered to CR-logical operations which suffer from significant latency and
+// low ILP.
+SDNode *IntegerCompareEliminator::tryLogicOpOfCompares(SDNode *N) {
+ if (N->getValueType(0) != MVT::i1)
+ return nullptr;
+ assert(isLogicOp(N->getOpcode()) &&
+ "Expected a logic operation on setcc results.");
+ SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0));
+ if (!LoweredLogical)
+ return nullptr;
+
+ SDLoc dl(N);
+ bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8;
+ unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt;
+ SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32);
+ SDValue LHS = LoweredLogical.getOperand(0);
+ SDValue RHS = LoweredLogical.getOperand(1);
+ SDValue WideOp;
+ SDValue OpToConvToRecForm;
+
+ // Look through any 32-bit to 64-bit implicit extend nodes to find the
+ // opcode that is input to the XORI.
+ if (IsBitwiseNegate &&
+ LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG)
+ OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1);
+ else if (IsBitwiseNegate)
+ // If the input to the XORI isn't an extension, that's what we're after.
+ OpToConvToRecForm = LoweredLogical.getOperand(0);
+ else
+ // If this is not an XORI, it is a reg-reg logical op and we can convert
+ // it to record-form.
+ OpToConvToRecForm = LoweredLogical;
+
+ // Get the record-form version of the node we're looking to use to get the
+ // CR result from.
+ uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode();
+ int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc);
+
+ // Convert the right node to record-form. This is either the logical we're
+ // looking at or it is the input node to the negation (if we're looking at
+ // a bitwise negation).
+ if (NewOpc != -1 && IsBitwiseNegate) {
+ // The input to the XORI has a record-form. Use it.
+ assert(LoweredLogical.getConstantOperandVal(1) == 1 &&
+ "Expected a PPC::XORI8 only for bitwise negation.");
+ // Emit the record-form instruction.
+ std::vector<SDValue> Ops;
+ for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++)
+ Ops.push_back(OpToConvToRecForm.getOperand(i));
+
+ WideOp =
+ SDValue(CurDAG->getMachineNode(NewOpc, dl,
+ OpToConvToRecForm.getValueType(),
+ MVT::Glue, Ops), 0);
+ } else {
+ assert((NewOpc != -1 || !IsBitwiseNegate) &&
+ "No record form available for AND8/OR8/XOR8?");
+ WideOp =
+ SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl,
+ MVT::i64, MVT::Glue, LHS, RHS), 0);
+ }
+
+ // Select this node to a single bit from CR0 set by the record-form node
+ // just created. For bitwise negation, use the EQ bit which is the equivalent
+ // of negating the result (i.e. it is a bit set when the result of the
+ // operation is zero).
+ SDValue SRIdxVal =
+ CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32);
+ SDValue CRBit =
+ SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
+ MVT::i1, CR0Reg, SRIdxVal,
+ WideOp.getValue(1)), 0);
+ return CRBit.getNode();
+}
+
+// Lower a logical operation on i1 values into a GPR sequence if possible.
+// The result can be kept in a GPR if requested.
+// Three types of inputs can be handled:
+// - SETCC
+// - TRUNCATE
+// - Logical operation (AND/OR/XOR)
+// There is also a special case that is handled (namely a complement operation
+// achieved with xor %a, -1).
+SDValue IntegerCompareEliminator::computeLogicOpInGPR(SDValue LogicOp) {
+ assert(isLogicOp(LogicOp.getOpcode()) &&
+ "Can only handle logic operations here.");
+ assert(LogicOp.getValueType() == MVT::i1 &&
+ "Can only handle logic operations on i1 values here.");
+ SDLoc dl(LogicOp);
+ SDValue LHS, RHS;
+
+ // Special case: xor %a, -1
+ bool IsBitwiseNegation = isBitwiseNot(LogicOp);
+
+ // Produces a GPR sequence for each operand of the binary logic operation.
+  // For SETCC, it produces the respective comparison; for TRUNCATE it keeps
+  // the truncated value in a GPR; and for logic operations, it recursively
+  // produces a GPR sequence for the operation.
+ auto getLogicOperand = [&] (SDValue Operand) -> SDValue {
+ unsigned OperandOpcode = Operand.getOpcode();
+ if (OperandOpcode == ISD::SETCC)
+ return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig);
+ else if (OperandOpcode == ISD::TRUNCATE) {
+ SDValue InputOp = Operand.getOperand(0);
+ EVT InVT = InputOp.getValueType();
+ return SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 :
+ PPC::RLDICL, dl, InVT, InputOp,
+ S->getI64Imm(0, dl),
+ S->getI64Imm(63, dl)), 0);
+ } else if (isLogicOp(OperandOpcode))
+ return computeLogicOpInGPR(Operand);
+ return SDValue();
+ };
+ LHS = getLogicOperand(LogicOp.getOperand(0));
+ RHS = getLogicOperand(LogicOp.getOperand(1));
+
+ // If a GPR sequence can't be produced for the LHS we can't proceed.
+ // Not producing a GPR sequence for the RHS is only a problem if this isn't
+ // a bitwise negation operation.
+ if (!LHS || (!RHS && !IsBitwiseNegation))
+ return SDValue();
+
+ NumLogicOpsOnComparison++;
+
+ // We will use the inputs as 64-bit values.
+ if (LHS.getValueType() == MVT::i32)
+ LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext);
+ if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32)
+ RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext);
+
+ unsigned NewOpc;
+ switch (LogicOp.getOpcode()) {
+ default: llvm_unreachable("Unknown logic operation.");
+ case ISD::AND: NewOpc = PPC::AND8; break;
+ case ISD::OR: NewOpc = PPC::OR8; break;
+ case ISD::XOR: NewOpc = PPC::XOR8; break;
+ }
+
+ if (IsBitwiseNegation) {
+ RHS = S->getI64Imm(1, dl);
+ NewOpc = PPC::XORI8;
+ }
+
+ return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0);
+
+}
+
+/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it.
+/// Otherwise just reinterpret it as a 64-bit value.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue IntegerCompareEliminator::signExtendInputIfNeeded(SDValue Input) {
+ assert(Input.getValueType() == MVT::i32 &&
+ "Can only sign-extend 32-bit values here.");
+ unsigned Opc = Input.getOpcode();
+
+ // The value was sign extended and then truncated to 32-bits. No need to
+ // sign extend it again.
+ if (Opc == ISD::TRUNCATE &&
+ (Input.getOperand(0).getOpcode() == ISD::AssertSext ||
+ Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND))
+ return addExtOrTrunc(Input, ExtOrTruncConversion::Ext);
+
+ LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+ // The input is a sign-extending load. All ppc sign-extending loads
+ // sign-extend to the full 64-bits.
+ if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD)
+ return addExtOrTrunc(Input, ExtOrTruncConversion::Ext);
+
+ ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+ // We don't sign-extend constants.
+ if (InputConst)
+ return addExtOrTrunc(Input, ExtOrTruncConversion::Ext);
+
+ SDLoc dl(Input);
+ SignExtensionsAdded++;
+ return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32_64, dl,
+ MVT::i64, Input), 0);
+}
+
+/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it.
+/// Otherwise just reinterpret it as a 64-bit value.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue IntegerCompareEliminator::zeroExtendInputIfNeeded(SDValue Input) {
+ assert(Input.getValueType() == MVT::i32 &&
+ "Can only zero-extend 32-bit values here.");
+ unsigned Opc = Input.getOpcode();
+
+  // We can omit the actual extend instruction only in the following cases:
+  // - The value is a non-negative constant
+  // - The value comes from a load that isn't a sign-extending load
+  // An ISD::TRUNCATE needs to be zero-extended unless it is fed by a zext.
+ bool IsTruncateOfZExt = Opc == ISD::TRUNCATE &&
+ (Input.getOperand(0).getOpcode() == ISD::AssertZext ||
+ Input.getOperand(0).getOpcode() == ISD::ZERO_EXTEND);
+ if (IsTruncateOfZExt)
+ return addExtOrTrunc(Input, ExtOrTruncConversion::Ext);
+
+ ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+ if (InputConst && InputConst->getSExtValue() >= 0)
+ return addExtOrTrunc(Input, ExtOrTruncConversion::Ext);
+
+ LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+ // The input is a load that doesn't sign-extend (it will be zero-extended).
+ if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD)
+ return addExtOrTrunc(Input, ExtOrTruncConversion::Ext);
+
+ // None of the above, need to zero-extend.
+ SDLoc dl(Input);
+ ZeroExtensionsAdded++;
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32_64, dl, MVT::i64, Input,
+ S->getI64Imm(0, dl),
+ S->getI64Imm(32, dl)), 0);
+}
+
+// Handle a 32-bit value in a 64-bit register and vice versa. These are not
+// actual zero/sign extensions that generate machine code; they are just a way
+// to reinterpret a 32-bit value in a register as a 64-bit value and
+// vice versa.
+SDValue IntegerCompareEliminator::addExtOrTrunc(SDValue NatWidthRes,
+ ExtOrTruncConversion Conv) {
+ SDLoc dl(NatWidthRes);
+
+ // For reinterpreting 32-bit values as 64 bit values, we generate
+ // INSERT_SUBREG IMPLICIT_DEF:i64, <input>, TargetConstant:i32<1>
+ if (Conv == ExtOrTruncConversion::Ext) {
+ SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0);
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64,
+ ImDef, NatWidthRes, SubRegIdx), 0);
+ }
+
+  assert(Conv == ExtOrTruncConversion::Trunc &&
+         "Unknown conversion between 32- and 64-bit values.");
+ // For reinterpreting 64-bit values as 32-bit values, we just need to
+ // EXTRACT_SUBREG (i.e. extract the low word).
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32,
+ NatWidthRes, SubRegIdx), 0);
+}
+
+// Produce a GPR sequence for compound comparisons (<=, >=) against zero.
+// Handle both zero-extensions and sign-extensions.
+SDValue
+IntegerCompareEliminator::getCompoundZeroComparisonInGPR(SDValue LHS, SDLoc dl,
+ ZeroCompare CmpTy) {
+ EVT InVT = LHS.getValueType();
+ bool Is32Bit = InVT == MVT::i32;
+ SDValue ToExtend;
+
+ // Produce the value that needs to be either zero or sign extended.
+ switch (CmpTy) {
+ case ZeroCompare::GEZExt:
+ case ZeroCompare::GESExt:
+ ToExtend = SDValue(CurDAG->getMachineNode(Is32Bit ? PPC::NOR : PPC::NOR8,
+ dl, InVT, LHS, LHS), 0);
+ break;
+ case ZeroCompare::LEZExt:
+ case ZeroCompare::LESExt: {
+ if (Is32Bit) {
+ // Upper 32 bits cannot be undefined for this sequence.
+ LHS = signExtendInputIfNeeded(LHS);
+ SDValue Neg =
+ SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, LHS), 0);
+ ToExtend =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+ Neg, S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ } else {
+ SDValue Addi =
+ SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, LHS,
+ S->getI64Imm(~0ULL, dl)), 0);
+ ToExtend = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+ Addi, LHS), 0);
+ }
+ break;
+ }
+ }
+
+ // For 64-bit sequences, the extensions are the same for the GE/LE cases.
+ if (!Is32Bit &&
+ (CmpTy == ZeroCompare::GEZExt || CmpTy == ZeroCompare::LEZExt))
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+ ToExtend, S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ if (!Is32Bit &&
+ (CmpTy == ZeroCompare::GESExt || CmpTy == ZeroCompare::LESExt))
+ return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, ToExtend,
+ S->getI64Imm(63, dl)), 0);
+
+ assert(Is32Bit && "Should have handled the 32-bit sequences above.");
+ // For 32-bit sequences, the extensions differ between GE/LE cases.
+ switch (CmpTy) {
+ case ZeroCompare::GEZExt: {
+ SDValue ShiftOps[] = { ToExtend, S->getI32Imm(1, dl), S->getI32Imm(31, dl),
+ S->getI32Imm(31, dl) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+ ShiftOps), 0);
+ }
+ case ZeroCompare::GESExt:
+ return SDValue(CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, ToExtend,
+ S->getI32Imm(31, dl)), 0);
+ case ZeroCompare::LEZExt:
+ return SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, ToExtend,
+ S->getI32Imm(1, dl)), 0);
+ case ZeroCompare::LESExt:
+ return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, ToExtend,
+ S->getI32Imm(-1, dl)), 0);
+ }
+
+  // The cases above cover all the enumerators, so the switch needs no default
+  // clause (one would only trigger compiler warnings).
+ llvm_unreachable("Unknown zero-comparison type.");
+}
+
+/// Produces a zero-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue
+IntegerCompareEliminator::get32BitZExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ if (CmpInGPR == ICGPR_I64 || CmpInGPR == ICGPR_SextI64 ||
+ CmpInGPR == ICGPR_ZextI64 || CmpInGPR == ICGPR_Sext)
+ return SDValue();
+ bool IsRHSZero = RHSValue == 0;
+ bool IsRHSOne = RHSValue == 1;
+ bool IsRHSNegOne = RHSValue == -1LL;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5)
+ // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+ SDValue ShiftOps[] = { Clz, S->getI32Imm(27, dl), S->getI32Imm(5, dl),
+ S->getI32Imm(31, dl) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+ ShiftOps), 0);
+ }
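+  // Worked example for the seteq sequence above: when %a == %b the xor is 0,
+  // cntlzw(0) is 32 and the shift right by 5 yields 1; for any inequality the
+  // leading-zero count is at most 31, so the shift yields 0.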
+ case ISD::SETNE: {
+ // (zext (setcc %a, %b, setne)) -> (xor (lshr (cntlzw (xor %a, %b)), 5), 1)
+ // (zext (setcc %a, 0, setne)) -> (xor (lshr (cntlzw %a), 5), 1)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+ SDValue ShiftOps[] = { Clz, S->getI32Imm(27, dl), S->getI32Imm(5, dl),
+ S->getI32Imm(31, dl) };
+ SDValue Shift =
+ SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift,
+ S->getI32Imm(1, dl)), 0);
+ }
+ case ISD::SETGE: {
+ // (zext (setcc %a, %b, setge)) -> (xor (lshr (sub %a, %b), 63), 1)
+ // (zext (setcc %a, 0, setge)) -> (lshr (~ %a), 31)
+ if(IsRHSZero)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt);
+
+ // Not a special case (i.e. RHS == 0). Handle (%a >= %b) as (%b <= %a)
+ // by swapping inputs and falling through.
+ std::swap(LHS, RHS);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ IsRHSZero = RHSConst && RHSConst->isNullValue();
+ LLVM_FALLTHROUGH;
+ }
+ case ISD::SETLE: {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ // (zext (setcc %a, %b, setle)) -> (xor (lshr (sub %b, %a), 63), 1)
+ // (zext (setcc %a, 0, setle)) -> (xor (lshr (- %a), 63), 1)
+ if(IsRHSZero) {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt);
+ }
+
+ // The upper 32-bits of the register can't be undefined for this sequence.
+ LHS = signExtendInputIfNeeded(LHS);
+ RHS = signExtendInputIfNeeded(RHS);
+ SDValue Sub =
+ SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, LHS, RHS), 0);
+ SDValue Shift =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Sub,
+ S->getI64Imm(1, dl), S->getI64Imm(63, dl)),
+ 0);
+ return
+ SDValue(CurDAG->getMachineNode(PPC::XORI8, dl,
+ MVT::i64, Shift, S->getI32Imm(1, dl)), 0);
+ }
+ case ISD::SETGT: {
+ // (zext (setcc %a, %b, setgt)) -> (lshr (sub %b, %a), 63)
+ // (zext (setcc %a, -1, setgt)) -> (lshr (~ %a), 31)
+ // (zext (setcc %a, 0, setgt)) -> (lshr (- %a), 63)
+    // Handle SETGT -1 (which is equivalent to SETGE 0).
+ if (IsRHSNegOne)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt);
+
+ if (IsRHSZero) {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ // The upper 32-bits of the register can't be undefined for this sequence.
+ LHS = signExtendInputIfNeeded(LHS);
+ RHS = signExtendInputIfNeeded(RHS);
+ SDValue Neg =
+ SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, LHS), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+ Neg, S->getI32Imm(1, dl), S->getI32Imm(63, dl)), 0);
+ }
+ // Not a special case (i.e. RHS == 0 or RHS == -1). Handle (%a > %b) as
+ // (%b < %a) by swapping inputs and falling through.
+ std::swap(LHS, RHS);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1;
+ LLVM_FALLTHROUGH;
+ }
+ case ISD::SETLT: {
+ // (zext (setcc %a, %b, setlt)) -> (lshr (sub %a, %b), 63)
+ // (zext (setcc %a, 1, setlt)) -> (xor (lshr (- %a), 63), 1)
+ // (zext (setcc %a, 0, setlt)) -> (lshr %a, 31)
+ // Handle SETLT 1 (which is equivalent to SETLE 0).
+ if (IsRHSOne) {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt);
+ }
+
+ if (IsRHSZero) {
+ SDValue ShiftOps[] = { LHS, S->getI32Imm(1, dl), S->getI32Imm(31, dl),
+ S->getI32Imm(31, dl) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+ ShiftOps), 0);
+ }
+
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ // The upper 32-bits of the register can't be undefined for this sequence.
+ LHS = signExtendInputIfNeeded(LHS);
+ RHS = signExtendInputIfNeeded(RHS);
+ SDValue SUBFNode =
+ SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+ SUBFNode, S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ }
+ case ISD::SETUGE:
+ // (zext (setcc %a, %b, setuge)) -> (xor (lshr (sub %b, %a), 63), 1)
+ // (zext (setcc %a, %b, setule)) -> (xor (lshr (sub %a, %b), 63), 1)
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETULE: {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ // The upper 32-bits of the register can't be undefined for this sequence.
+ LHS = zeroExtendInputIfNeeded(LHS);
+ RHS = zeroExtendInputIfNeeded(RHS);
+ SDValue Subtract =
+ SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, LHS, RHS), 0);
+ SDValue SrdiNode =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+ Subtract, S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, SrdiNode,
+ S->getI32Imm(1, dl)), 0);
+ }
+ case ISD::SETUGT:
+ // (zext (setcc %a, %b, setugt)) -> (lshr (sub %b, %a), 63)
+ // (zext (setcc %a, %b, setult)) -> (lshr (sub %a, %b), 63)
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETULT: {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ // The upper 32-bits of the register can't be undefined for this sequence.
+ LHS = zeroExtendInputIfNeeded(LHS);
+ RHS = zeroExtendInputIfNeeded(RHS);
+ SDValue Subtract =
+ SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+ Subtract, S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ }
+ }
+}
+
+/// Produces a sign-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue
+IntegerCompareEliminator::get32BitSExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ if (CmpInGPR == ICGPR_I64 || CmpInGPR == ICGPR_SextI64 ||
+ CmpInGPR == ICGPR_ZextI64 || CmpInGPR == ICGPR_Zext)
+ return SDValue();
+ bool IsRHSZero = RHSValue == 0;
+ bool IsRHSOne = RHSValue == 1;
+ bool IsRHSNegOne = RHSValue == -1LL;
+
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (sext (setcc %a, %b, seteq)) ->
+ // (ashr (shl (ctlz (xor %a, %b)), 58), 63)
+ // (sext (setcc %a, 0, seteq)) ->
+ // (ashr (shl (ctlz %a), 58), 63)
+ SDValue CountInput = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Cntlzw =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0);
+ SDValue SHLOps[] = { Cntlzw, S->getI32Imm(27, dl),
+ S->getI32Imm(5, dl), S->getI32Imm(31, dl) };
+ SDValue Slwi =
+ SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, SHLOps), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Slwi), 0);
+ }
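+  // Worked example: when %a == %b, cntlzw(0) = 32, the rlwinm (shift right by
+  // 5) produces 1 and the final neg yields -1 (all ones); otherwise the shift
+  // produces 0 and the result stays 0.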
+ case ISD::SETNE: {
+ // Bitwise xor the operands, count leading zeros, shift right by 5 bits and
+ // flip the bit, finally take 2's complement.
+ // (sext (setcc %a, %b, setne)) ->
+ // (neg (xor (lshr (ctlz (xor %a, %b)), 5), 1))
+ // Same as above, but the first xor is not needed.
+ // (sext (setcc %a, 0, setne)) ->
+ // (neg (xor (lshr (ctlz %a), 5), 1))
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+ SDValue ShiftOps[] =
+ { Clz, S->getI32Imm(27, dl), S->getI32Imm(5, dl), S->getI32Imm(31, dl) };
+ SDValue Shift =
+ SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0);
+ SDValue Xori =
+ SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift,
+ S->getI32Imm(1, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Xori), 0);
+ }
+ case ISD::SETGE: {
+ // (sext (setcc %a, %b, setge)) -> (add (lshr (sub %a, %b), 63), -1)
+ // (sext (setcc %a, 0, setge)) -> (ashr (~ %a), 31)
+ if (IsRHSZero)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt);
+
+ // Not a special case (i.e. RHS == 0). Handle (%a >= %b) as (%b <= %a)
+ // by swapping inputs and falling through.
+ std::swap(LHS, RHS);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ IsRHSZero = RHSConst && RHSConst->isNullValue();
+ LLVM_FALLTHROUGH;
+ }
+ case ISD::SETLE: {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+    // (sext (setcc %a, %b, setle)) -> (add (lshr (sub %b, %a), 63), -1)
+ // (sext (setcc %a, 0, setle)) -> (add (lshr (- %a), 63), -1)
+ if (IsRHSZero)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt);
+
+ // The upper 32-bits of the register can't be undefined for this sequence.
+ LHS = signExtendInputIfNeeded(LHS);
+ RHS = signExtendInputIfNeeded(RHS);
+ SDValue SUBFNode =
+ SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, MVT::Glue,
+ LHS, RHS), 0);
+ SDValue Srdi =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+ SUBFNode, S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, Srdi,
+ S->getI32Imm(-1, dl)), 0);
+ }
+ case ISD::SETGT: {
+ // (sext (setcc %a, %b, setgt)) -> (ashr (sub %b, %a), 63)
+ // (sext (setcc %a, -1, setgt)) -> (ashr (~ %a), 31)
+ // (sext (setcc %a, 0, setgt)) -> (ashr (- %a), 63)
+ if (IsRHSNegOne)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt);
+ if (IsRHSZero) {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ // The upper 32-bits of the register can't be undefined for this sequence.
+ LHS = signExtendInputIfNeeded(LHS);
+ RHS = signExtendInputIfNeeded(RHS);
+ SDValue Neg =
+ SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, LHS), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, Neg,
+ S->getI64Imm(63, dl)), 0);
+ }
+ // Not a special case (i.e. RHS == 0 or RHS == -1). Handle (%a > %b) as
+ // (%b < %a) by swapping inputs and falling through.
+ std::swap(LHS, RHS);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1;
+ LLVM_FALLTHROUGH;
+ }
+ case ISD::SETLT: {
+    // (sext (setcc %a, %b, setlt)) -> (ashr (sub %a, %b), 63)
+    // (sext (setcc %a, 1, setlt)) -> (add (lshr (- %a), 63), -1)
+    // (sext (setcc %a, 0, setlt)) -> (ashr %a, 31)
+ if (IsRHSOne) {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt);
+ }
+ if (IsRHSZero)
+ return SDValue(CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, LHS,
+ S->getI32Imm(31, dl)), 0);
+
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ // The upper 32-bits of the register can't be undefined for this sequence.
+ LHS = signExtendInputIfNeeded(LHS);
+ RHS = signExtendInputIfNeeded(RHS);
+ SDValue SUBFNode =
+ SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64,
+ SUBFNode, S->getI64Imm(63, dl)), 0);
+ }
+ case ISD::SETUGE:
+ // (sext (setcc %a, %b, setuge)) -> (add (lshr (sub %a, %b), 63), -1)
+ // (sext (setcc %a, %b, setule)) -> (add (lshr (sub %b, %a), 63), -1)
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETULE: {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ // The upper 32-bits of the register can't be undefined for this sequence.
+ LHS = zeroExtendInputIfNeeded(LHS);
+ RHS = zeroExtendInputIfNeeded(RHS);
+ SDValue Subtract =
+ SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, LHS, RHS), 0);
+ SDValue Shift =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Subtract,
+ S->getI32Imm(1, dl), S->getI32Imm(63,dl)),
+ 0);
+ return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, Shift,
+ S->getI32Imm(-1, dl)), 0);
+ }
+ case ISD::SETUGT:
+ // (sext (setcc %a, %b, setugt)) -> (ashr (sub %b, %a), 63)
+    // (sext (setcc %a, %b, setult)) -> (ashr (sub %a, %b), 63)
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETULT: {
+ if (CmpInGPR == ICGPR_NonExtIn)
+ return SDValue();
+ // The upper 32-bits of the register can't be undefined for this sequence.
+ LHS = zeroExtendInputIfNeeded(LHS);
+ RHS = zeroExtendInputIfNeeded(RHS);
+ SDValue Subtract =
+ SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64,
+ Subtract, S->getI64Imm(63, dl)), 0);
+ }
+ }
+}
+
+/// Produces a zero-extended result of comparing two 64-bit values according to
+/// the passed condition code.
+SDValue
+IntegerCompareEliminator::get64BitZExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ if (CmpInGPR == ICGPR_I32 || CmpInGPR == ICGPR_SextI32 ||
+ CmpInGPR == ICGPR_ZextI32 || CmpInGPR == ICGPR_Sext)
+ return SDValue();
+ bool IsRHSZero = RHSValue == 0;
+ bool IsRHSOne = RHSValue == 1;
+ bool IsRHSNegOne = RHSValue == -1LL;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6)
+ // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz,
+ S->getI64Imm(58, dl),
+ S->getI64Imm(63, dl)), 0);
+ }
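+  // Worked example: cntlzd of the xor is 64 exactly when %a == %b; the rldicl
+  // extracts bit 6 of that count, which is set only for the value 64, so the
+  // result is 1 on equality and 0 otherwise.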
+ case ISD::SETNE: {
+ // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1)
+ // (zext (setcc %a, %b, setne)) -> (sube addc.reg, addc.reg, addc.CA)
+ // {addcz.reg, addcz.CA} = (addcarry %a, -1)
+ // (zext (setcc %a, 0, setne)) -> (sube addcz.reg, addcz.reg, addcz.CA)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
+ SDValue AC =
+ SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue,
+ Xor, S->getI32Imm(~0U, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, AC,
+ Xor, AC.getValue(1)), 0);
+ }
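+  // Sketch of why the carry trick above works: addic computes Xor - 1 and sets
+  // CA exactly when Xor != 0; subfe then computes Xor + ~(Xor - 1) + CA, which
+  // collapses to CA, i.e. 1 when the operands differ and 0 when they are equal.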
+ case ISD::SETGE: {
+ // {subc.reg, subc.CA} = (subcarry %a, %b)
+ // (zext (setcc %a, %b, setge)) ->
+ // (adde (lshr %b, 63), (ashr %a, 63), subc.CA)
+ // (zext (setcc %a, 0, setge)) -> (lshr (~ %a), 63)
+ if (IsRHSZero)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt);
+ std::swap(LHS, RHS);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ IsRHSZero = RHSConst && RHSConst->isNullValue();
+ LLVM_FALLTHROUGH;
+ }
+ case ISD::SETLE: {
+ // {subc.reg, subc.CA} = (subcarry %b, %a)
+    // (zext (setcc %a, %b, setle)) ->
+    //   (adde (lshr %a, 63), (ashr %b, 63), subc.CA)
+    // (zext (setcc %a, 0, setle)) -> (lshr (or %a, (add %a, -1)), 63)
+ if (IsRHSZero)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt);
+ SDValue ShiftL =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, LHS,
+ S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ SDValue ShiftR =
+ SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, RHS,
+ S->getI64Imm(63, dl)), 0);
+ SDValue SubtractCarry =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue,
+ LHS, RHS), 1);
+ return SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, MVT::Glue,
+ ShiftR, ShiftL, SubtractCarry), 0);
+ }
+ case ISD::SETGT: {
+ // {subc.reg, subc.CA} = (subcarry %b, %a)
+ // (zext (setcc %a, %b, setgt)) ->
+ // (xor (adde (lshr %a, 63), (ashr %b, 63), subc.CA), 1)
+ // (zext (setcc %a, 0, setgt)) -> (lshr (nor (add %a, -1), %a), 63)
+ if (IsRHSNegOne)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt);
+ if (IsRHSZero) {
+ SDValue Addi =
+ SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, LHS,
+ S->getI64Imm(~0ULL, dl)), 0);
+ SDValue Nor =
+ SDValue(CurDAG->getMachineNode(PPC::NOR8, dl, MVT::i64, Addi, LHS), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Nor,
+ S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ }
+ std::swap(LHS, RHS);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1;
+ LLVM_FALLTHROUGH;
+ }
+ case ISD::SETLT: {
+ // {subc.reg, subc.CA} = (subcarry %a, %b)
+ // (zext (setcc %a, %b, setlt)) ->
+ // (xor (adde (lshr %b, 63), (ashr %a, 63), subc.CA), 1)
+ // (zext (setcc %a, 0, setlt)) -> (lshr %a, 63)
+ if (IsRHSOne)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt);
+ if (IsRHSZero)
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, LHS,
+ S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ SDValue SRADINode =
+ SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64,
+ LHS, S->getI64Imm(63, dl)), 0);
+ SDValue SRDINode =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+ RHS, S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ SDValue SUBFC8Carry =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue,
+ RHS, LHS), 1);
+ SDValue ADDE8Node =
+ SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, MVT::Glue,
+ SRDINode, SRADINode, SUBFC8Carry), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64,
+ ADDE8Node, S->getI64Imm(1, dl)), 0);
+ }
+ case ISD::SETUGE:
+ // {subc.reg, subc.CA} = (subcarry %a, %b)
+ // (zext (setcc %a, %b, setuge)) -> (add (sube %b, %b, subc.CA), 1)
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETULE: {
+ // {subc.reg, subc.CA} = (subcarry %b, %a)
+ // (zext (setcc %a, %b, setule)) -> (add (sube %a, %a, subc.CA), 1)
+ SDValue SUBFC8Carry =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue,
+ LHS, RHS), 1);
+ SDValue SUBFE8Node =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, MVT::Glue,
+ LHS, LHS, SUBFC8Carry), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64,
+ SUBFE8Node, S->getI64Imm(1, dl)), 0);
+ }
+ case ISD::SETUGT:
+ // {subc.reg, subc.CA} = (subcarry %b, %a)
+ // (zext (setcc %a, %b, setugt)) -> -(sube %b, %b, subc.CA)
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETULT: {
+ // {subc.reg, subc.CA} = (subcarry %a, %b)
+ // (zext (setcc %a, %b, setult)) -> -(sube %a, %a, subc.CA)
+ SDValue SubtractCarry =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue,
+ RHS, LHS), 1);
+ SDValue ExtSub =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64,
+ LHS, LHS, SubtractCarry), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64,
+ ExtSub), 0);
+ }
+ }
+}
+
+/// Produces a sign-extended result of comparing two 64-bit values according to
+/// the passed condition code.
+SDValue
+IntegerCompareEliminator::get64BitSExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ if (CmpInGPR == ICGPR_I32 || CmpInGPR == ICGPR_SextI32 ||
+ CmpInGPR == ICGPR_ZextI32 || CmpInGPR == ICGPR_Zext)
+ return SDValue();
+ bool IsRHSZero = RHSValue == 0;
+ bool IsRHSOne = RHSValue == 1;
+ bool IsRHSNegOne = RHSValue == -1LL;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1)
+ // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA)
+ // {addcz.reg, addcz.CA} = (addcarry %a, -1)
+ // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA)
+ SDValue AddInput = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
+ SDValue Addic =
+ SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue,
+ AddInput, S->getI32Imm(~0U, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic,
+ Addic, Addic.getValue(1)), 0);
+ }
+ case ISD::SETNE: {
+ // {subfc.reg, subfc.CA} = (subcarry 0, (xor %a, %b))
+ // (sext (setcc %a, %b, setne)) -> (sube subfc.reg, subfc.reg, subfc.CA)
+ // {subfcz.reg, subfcz.CA} = (subcarry 0, %a)
+ // (sext (setcc %a, 0, setne)) -> (sube subfcz.reg, subfcz.reg, subfcz.CA)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
+ SDValue SC =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFIC8, dl, MVT::i64, MVT::Glue,
+ Xor, S->getI32Imm(0, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, SC,
+ SC, SC.getValue(1)), 0);
+ }
+ case ISD::SETGE: {
+ // {subc.reg, subc.CA} = (subcarry %a, %b)
+    // (sext (setcc %a, %b, setge)) ->
+    //   (- (adde (lshr %b, 63), (ashr %a, 63), subc.CA))
+    // (sext (setcc %a, 0, setge)) -> (~ (ashr %a, 63))
+ if (IsRHSZero)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt);
+ std::swap(LHS, RHS);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ IsRHSZero = RHSConst && RHSConst->isNullValue();
+ LLVM_FALLTHROUGH;
+ }
+ case ISD::SETLE: {
+ // {subc.reg, subc.CA} = (subcarry %b, %a)
+    // (sext (setcc %a, %b, setle)) ->
+    //   (- (adde (lshr %a, 63), (ashr %b, 63), subc.CA))
+    // (sext (setcc %a, 0, setle)) -> (ashr (or %a, (add %a, -1)), 63)
+ if (IsRHSZero)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt);
+ SDValue ShiftR =
+ SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, RHS,
+ S->getI64Imm(63, dl)), 0);
+ SDValue ShiftL =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, LHS,
+ S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ SDValue SubtractCarry =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue,
+ LHS, RHS), 1);
+ SDValue Adde =
+ SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, MVT::Glue,
+ ShiftR, ShiftL, SubtractCarry), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, Adde), 0);
+ }
+ case ISD::SETGT: {
+ // {subc.reg, subc.CA} = (subcarry %b, %a)
+    // (sext (setcc %a, %b, setgt)) ->
+    //   -(xor (adde (lshr %a, 63), (ashr %b, 63), subc.CA), 1)
+    // (sext (setcc %a, 0, setgt)) -> (ashr (nor (add %a, -1), %a), 63)
+ if (IsRHSNegOne)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt);
+ if (IsRHSZero) {
+ SDValue Add =
+ SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, LHS,
+ S->getI64Imm(-1, dl)), 0);
+ SDValue Nor =
+ SDValue(CurDAG->getMachineNode(PPC::NOR8, dl, MVT::i64, Add, LHS), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, Nor,
+ S->getI64Imm(63, dl)), 0);
+ }
+ std::swap(LHS, RHS);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1;
+ LLVM_FALLTHROUGH;
+ }
+ case ISD::SETLT: {
+ // {subc.reg, subc.CA} = (subcarry %a, %b)
+    // (sext (setcc %a, %b, setlt)) ->
+    //   -(xor (adde (lshr %b, 63), (ashr %a, 63), subc.CA), 1)
+    // (sext (setcc %a, 0, setlt)) -> (ashr %a, 63)
+ if (IsRHSOne)
+ return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt);
+ if (IsRHSZero) {
+ return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, LHS,
+ S->getI64Imm(63, dl)), 0);
+ }
+ SDValue SRADINode =
+ SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64,
+ LHS, S->getI64Imm(63, dl)), 0);
+ SDValue SRDINode =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+ RHS, S->getI64Imm(1, dl),
+ S->getI64Imm(63, dl)), 0);
+ SDValue SUBFC8Carry =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue,
+ RHS, LHS), 1);
+ SDValue ADDE8Node =
+ SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64,
+ SRDINode, SRADINode, SUBFC8Carry), 0);
+ SDValue XORI8Node =
+ SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64,
+ ADDE8Node, S->getI64Imm(1, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64,
+ XORI8Node), 0);
+ }
+ case ISD::SETUGE:
+ // {subc.reg, subc.CA} = (subcarry %a, %b)
+ // (sext (setcc %a, %b, setuge)) -> ~(sube %b, %b, subc.CA)
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETULE: {
+ // {subc.reg, subc.CA} = (subcarry %b, %a)
+ // (sext (setcc %a, %b, setule)) -> ~(sube %a, %a, subc.CA)
+ SDValue SubtractCarry =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue,
+ LHS, RHS), 1);
+ SDValue ExtSub =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, MVT::Glue, LHS,
+ LHS, SubtractCarry), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::NOR8, dl, MVT::i64,
+ ExtSub, ExtSub), 0);
+ }
+ case ISD::SETUGT:
+ // {subc.reg, subc.CA} = (subcarry %b, %a)
+ // (sext (setcc %a, %b, setugt)) -> (sube %b, %b, subc.CA)
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETULT: {
+ // {subc.reg, subc.CA} = (subcarry %a, %b)
+ // (sext (setcc %a, %b, setult)) -> (sube %a, %a, subc.CA)
+ SDValue SubCarry =
+ SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue,
+ RHS, LHS), 1);
+ return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64,
+ LHS, LHS, SubCarry), 0);
+ }
+ }
+}
+
+/// Do all uses of this SDValue need the result in a GPR?
+/// This is meant to be used on values that have type i1 since
+/// it is somewhat meaningless to ask if values of other types
+/// should be kept in GPRs.
+static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) {
+ assert(Compare.getOpcode() == ISD::SETCC &&
+ "An ISD::SETCC node required here.");
+
+ // For values that have a single use, the caller should obviously already have
+ // checked if that use is an extending use. We check the other uses here.
+ if (Compare.hasOneUse())
+ return true;
+ // We want the value in a GPR if it is being extended, used for a select, or
+ // used in logical operations.
+ for (auto CompareUse : Compare.getNode()->uses())
+ if (CompareUse->getOpcode() != ISD::SIGN_EXTEND &&
+ CompareUse->getOpcode() != ISD::ZERO_EXTEND &&
+ CompareUse->getOpcode() != ISD::SELECT &&
+ !isLogicOp(CompareUse->getOpcode())) {
+ OmittedForNonExtendUses++;
+ return false;
+ }
+ return true;
+}
+
+/// Returns an equivalent of a SETCC node but with the result the same width as
+/// the inputs. This can also be used for SELECT_CC if either the true or false
+/// value is a power of two while the other is zero.
+SDValue IntegerCompareEliminator::getSETCCInGPR(SDValue Compare,
+ SetccInGPROpts ConvOpts) {
+ assert((Compare.getOpcode() == ISD::SETCC ||
+ Compare.getOpcode() == ISD::SELECT_CC) &&
+ "An ISD::SETCC node required here.");
+
+ // Don't convert this comparison to a GPR sequence because there are uses
+ // of the i1 result (i.e. uses that require the result in the CR).
+ if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG))
+ return SDValue();
+
+ SDValue LHS = Compare.getOperand(0);
+ SDValue RHS = Compare.getOperand(1);
+
+ // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC.
+ int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2;
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get();
+ EVT InputVT = LHS.getValueType();
+ if (InputVT != MVT::i32 && InputVT != MVT::i64)
+ return SDValue();
+
+ if (ConvOpts == SetccInGPROpts::ZExtInvert ||
+ ConvOpts == SetccInGPROpts::SExtInvert)
+ CC = ISD::getSetCCInverse(CC, true);
+
+ bool Inputs32Bit = InputVT == MVT::i32;
+
+ SDLoc dl(Compare);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX;
+ bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig ||
+ ConvOpts == SetccInGPROpts::SExtInvert;
+
+ if (IsSext && Inputs32Bit)
+ return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
+ else if (Inputs32Bit)
+ return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+ else if (IsSext)
+ return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
+ return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+}
+
} // end anonymous namespace
+bool PPCDAGToDAGISel::tryIntCompareInGPR(SDNode *N) {
+ if (N->getValueType(0) != MVT::i32 &&
+ N->getValueType(0) != MVT::i64)
+ return false;
+
+ // This optimization will emit code that assumes 64-bit registers
+ // so we don't want to run it in 32-bit mode. Also don't run it
+ // on functions that are not to be optimized.
+ if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
+ return false;
+
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ IntegerCompareEliminator ICmpElim(CurDAG, this);
+ if (SDNode *New = ICmpElim.Select(N)) {
+ ReplaceNode(N, New);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) {
if (N->getValueType(0) != MVT::i32 &&
N->getValueType(0) != MVT::i64)
@@ -2504,506 +3819,6 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
return true;
}
-// Is this opcode a bitwise logical operation?
-static bool isLogicOp(unsigned Opc) {
- return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR;
-}
-
-/// If this node is a sign/zero extension of an integer comparison,
-/// it can usually be computed in GPR's rather than using comparison
-/// instructions and ISEL. We only do this on 64-bit targets for now
-/// as the code is specialized for 64-bit (it uses 64-bit instructions
-/// and assumes 64-bit registers).
-bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) {
- if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
- return false;
- assert((N->getOpcode() == ISD::ZERO_EXTEND ||
- N->getOpcode() == ISD::SIGN_EXTEND) &&
- "Expecting a zero/sign extend node!");
-
- SDValue WideRes;
- // If we are zero-extending the result of a logical operation on i1
- // values, we can keep the values in GPRs.
- if (isLogicOp(N->getOperand(0).getOpcode()) &&
- N->getOperand(0).getValueType() == MVT::i1 &&
- N->getOpcode() == ISD::ZERO_EXTEND)
- WideRes = computeLogicOpInGPR(N->getOperand(0));
- else if (N->getOperand(0).getOpcode() != ISD::SETCC)
- return false;
- else
- WideRes =
- getSETCCInGPR(N->getOperand(0),
- N->getOpcode() == ISD::SIGN_EXTEND ?
- SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig);
-
- if (!WideRes)
- return false;
-
- SDLoc dl(N);
- bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32;
- bool Output32Bit = N->getValueType(0) == MVT::i32;
-
- NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0;
- NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1;
-
- SDValue ConvOp = WideRes;
- if (Inputs32Bit != Output32Bit)
- ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext :
- ExtOrTruncConversion::Trunc);
- ReplaceNode(N, ConvOp.getNode());
-
- return true;
-}
-
-// Lower a logical operation on i1 values into a GPR sequence if possible.
-// The result can be kept in a GPR if requested.
-// Three types of inputs can be handled:
-// - SETCC
-// - TRUNCATE
-// - Logical operation (AND/OR/XOR)
-// There is also a special case that is handled (namely a complement operation
-// achieved with xor %a, -1).
-SDValue PPCDAGToDAGISel::computeLogicOpInGPR(SDValue LogicOp) {
- assert(isLogicOp(LogicOp.getOpcode()) &&
- "Can only handle logic operations here.");
- assert(LogicOp.getValueType() == MVT::i1 &&
- "Can only handle logic operations on i1 values here.");
- SDLoc dl(LogicOp);
- SDValue LHS, RHS;
-
- // Special case: xor %a, -1
- bool IsBitwiseNegation = isBitwiseNot(LogicOp);
-
- // Produces a GPR sequence for each operand of the binary logic operation.
- // For SETCC, it produces the respective comparison, for TRUNCATE it truncates
- // the value in a GPR and for logic operations, it will recursively produce
- // a GPR sequence for the operation.
- auto getLogicOperand = [&] (SDValue Operand) -> SDValue {
- unsigned OperandOpcode = Operand.getOpcode();
- if (OperandOpcode == ISD::SETCC)
- return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig);
- else if (OperandOpcode == ISD::TRUNCATE) {
- SDValue InputOp = Operand.getOperand(0);
- EVT InVT = InputOp.getValueType();
- return
- SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 :
- PPC::RLDICL, dl, InVT, InputOp,
- getI64Imm(0, dl), getI64Imm(63, dl)), 0);
- } else if (isLogicOp(OperandOpcode))
- return computeLogicOpInGPR(Operand);
- return SDValue();
- };
- LHS = getLogicOperand(LogicOp.getOperand(0));
- RHS = getLogicOperand(LogicOp.getOperand(1));
-
- // If a GPR sequence can't be produced for the LHS we can't proceed.
- // Not producing a GPR sequence for the RHS is only a problem if this isn't
- // a bitwise negation operation.
- if (!LHS || (!RHS && !IsBitwiseNegation))
- return SDValue();
-
- NumLogicOpsOnComparison++;
-
- // We will use the inputs as 64-bit values.
- if (LHS.getValueType() == MVT::i32)
- LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext);
- if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32)
- RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext);
-
- unsigned NewOpc;
- switch (LogicOp.getOpcode()) {
- default: llvm_unreachable("Unknown logic operation.");
- case ISD::AND: NewOpc = PPC::AND8; break;
- case ISD::OR: NewOpc = PPC::OR8; break;
- case ISD::XOR: NewOpc = PPC::XOR8; break;
- }
-
- if (IsBitwiseNegation) {
- RHS = getI64Imm(1, dl);
- NewOpc = PPC::XORI8;
- }
-
- return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0);
-
-}
-
-/// Try performing logical operations on results of comparisons in GPRs.
-/// It is typically preferred from a performance perspective over performing
-/// the operations on individual bits in the CR. We only do this on 64-bit
-/// targets for now as the code is specialized for 64-bit (it uses 64-bit
-/// instructions and assumes 64-bit registers).
-bool PPCDAGToDAGISel::tryLogicOpOfCompares(SDNode *N) {
- if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
- return false;
- if (N->getValueType(0) != MVT::i1)
- return false;
- assert(isLogicOp(N->getOpcode()) &&
- "Expected a logic operation on setcc results.");
- SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0));
- if (!LoweredLogical)
- return false;
-
- SDLoc dl(N);
- bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8;
- unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt;
- SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32);
- SDValue LHS = LoweredLogical.getOperand(0);
- SDValue RHS = LoweredLogical.getOperand(1);
- SDValue WideOp;
- SDValue OpToConvToRecForm;
-
- // Look through any 32-bit to 64-bit implicit extend nodes to find the opcode
- // that is input to the XORI.
- if (IsBitwiseNegate &&
- LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG)
- OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1);
- else if (IsBitwiseNegate)
- // If the input to the XORI isn't an extension, that's what we're after.
- OpToConvToRecForm = LoweredLogical.getOperand(0);
- else
- // If this is not an XORI, it is a reg-reg logical op and we can convert it
- // to record-form.
- OpToConvToRecForm = LoweredLogical;
-
- // Get the record-form version of the node we're looking to use to get the
- // CR result from.
- uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode();
- int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc);
-
- // Convert the right node to record-form. This is either the logical we're
- // looking at or it is the input node to the negation (if we're looking at
- // a bitwise negation).
- if (NewOpc != -1 && IsBitwiseNegate) {
- // The input to the XORI has a record-form. Use it.
- assert(LoweredLogical.getConstantOperandVal(1) == 1 &&
- "Expected a PPC::XORI8 only for bitwise negation.");
- // Emit the record-form instruction.
- std::vector<SDValue> Ops;
- for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++)
- Ops.push_back(OpToConvToRecForm.getOperand(i));
-
- WideOp =
- SDValue(CurDAG->getMachineNode(NewOpc, dl,
- OpToConvToRecForm.getValueType(),
- MVT::Glue, Ops), 0);
- } else {
- assert((NewOpc != -1 || !IsBitwiseNegate) &&
- "No record form available for AND8/OR8/XOR8?");
- WideOp =
- SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl,
- MVT::i64, MVT::Glue, LHS, RHS), 0);
- }
-
- // Select this node to a single bit from CR0 set by the record-form node
- // just created. For bitwise negation, use the EQ bit which is the equivalent
- // of negating the result (i.e. it is a bit set when the result of the
- // operation is zero).
- SDValue SRIdxVal =
- CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32);
- SDValue CRBit =
- SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
- MVT::i1, CR0Reg, SRIdxVal,
- WideOp.getValue(1)), 0);
- ReplaceNode(N, CRBit.getNode());
- return true;
-}
-
-/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it.
-/// Useful when emitting comparison code for 32-bit values without using
-/// the compare instruction (which only considers the lower 32-bits).
-SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) {
- assert(Input.getValueType() == MVT::i32 &&
- "Can only sign-extend 32-bit values here.");
- unsigned Opc = Input.getOpcode();
-
- // The value was sign extended and then truncated to 32-bits. No need to
- // sign extend it again.
- if (Opc == ISD::TRUNCATE &&
- (Input.getOperand(0).getOpcode() == ISD::AssertSext ||
- Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND))
- return Input;
-
- LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
- // The input is a sign-extending load. No reason to sign-extend.
- if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD)
- return Input;
-
- ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
- // We don't sign-extend constants and already sign-extended values.
- if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG ||
- Opc == ISD::SIGN_EXTEND)
- return Input;
-
- SDLoc dl(Input);
- SignExtensionsAdded++;
- return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0);
-}
-
-/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it.
-/// Useful when emitting comparison code for 32-bit values without using
-/// the compare instruction (which only considers the lower 32-bits).
-SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) {
- assert(Input.getValueType() == MVT::i32 &&
- "Can only zero-extend 32-bit values here.");
- LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
- unsigned Opc = Input.getOpcode();
-
- // No need to zero-extend loaded values (unless they're loaded with
- // a sign-extending load).
- if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD)
- return Input;
-
- ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
- bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0;
- // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG so we have
- // to conservatively actually clear the high bits. We also don't need to
- // zero-extend constants or values that are already zero-extended.
- if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND)
- return Input;
-
- SDLoc dl(Input);
- ZeroExtensionsAdded++;
- return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input,
- getI64Imm(0, dl), getI64Imm(32, dl)),
- 0);
-}
-
-// Handle a 32-bit value in a 64-bit register and vice-versa. These are of
-// course not actual zero/sign extensions that will generate machine code,
-// they're just a way to reinterpret a 32 bit value in a register as a
-// 64 bit value and vice-versa.
-SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes,
- ExtOrTruncConversion Conv) {
- SDLoc dl(NatWidthRes);
-
- // For reinterpreting 32-bit values as 64 bit values, we generate
- // INSERT_SUBREG IMPLICIT_DEF:i64, <input>, TargetConstant:i32<1>
- if (Conv == ExtOrTruncConversion::Ext) {
- SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0);
- SDValue SubRegIdx =
- CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
- return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64,
- ImDef, NatWidthRes, SubRegIdx), 0);
- }
-
- assert(Conv == ExtOrTruncConversion::Trunc &&
- "Unknown convertion between 32 and 64 bit values.");
- // For reinterpreting 64-bit values as 32-bit values, we just need to
- // EXTRACT_SUBREG (i.e. extract the low word).
- SDValue SubRegIdx =
- CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
- return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32,
- NatWidthRes, SubRegIdx), 0);
-}
-
-/// Produces a zero-extended result of comparing two 32-bit values according to
-/// the passed condition code.
-SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS,
- ISD::CondCode CC,
- int64_t RHSValue, SDLoc dl) {
- bool IsRHSZero = RHSValue == 0;
- switch (CC) {
- default: return SDValue();
- case ISD::SETEQ: {
- // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5)
- // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5)
- SDValue Xor = IsRHSZero ? LHS :
- SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
- SDValue Clz =
- SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
- SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl),
- getI32Imm(31, dl) };
- return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
- ShiftOps), 0);
- }
- case ISD::SETNE: {
- // (zext (setcc %a, %b, setne)) -> (xor (lshr (cntlzw (xor %a, %b)), 5), 1)
- // (zext (setcc %a, 0, setne)) -> (xor (lshr (cntlzw %a), 5), 1)
- SDValue Xor = IsRHSZero ? LHS :
- SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
- SDValue Clz =
- SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
- SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl),
- getI32Imm(31, dl) };
- SDValue Shift =
- SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0);
- return SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift,
- getI32Imm(1, dl)), 0);
- }
- }
-}
-
-/// Produces a sign-extended result of comparing two 32-bit values according to
-/// the passed condition code.
-SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS,
- ISD::CondCode CC,
- int64_t RHSValue, SDLoc dl) {
- bool IsRHSZero = RHSValue == 0;
- switch (CC) {
- default: return SDValue();
- case ISD::SETEQ: {
- // (sext (setcc %a, %b, seteq)) ->
- // (ashr (shl (ctlz (xor %a, %b)), 58), 63)
- // (sext (setcc %a, 0, seteq)) ->
- // (ashr (shl (ctlz %a), 58), 63)
- SDValue CountInput = IsRHSZero ? LHS :
- SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
- SDValue Cntlzw =
- SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0);
- SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) };
- SDValue Sldi =
- SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0);
- return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi,
- getI32Imm(63, dl)), 0);
- }
- case ISD::SETNE: {
- // Bitwise xor the operands, count leading zeros, shift right by 5 bits and
- // flip the bit, finally take 2's complement.
- // (sext (setcc %a, %b, setne)) ->
- // (neg (xor (lshr (ctlz (xor %a, %b)), 5), 1))
- // Same as above, but the first xor is not needed.
- // (sext (setcc %a, 0, setne)) ->
- // (neg (xor (lshr (ctlz %a), 5), 1))
- SDValue Xor = IsRHSZero ? LHS :
- SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
- SDValue Clz =
- SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
- SDValue ShiftOps[] =
- { Clz, getI32Imm(27, dl), getI32Imm(5, dl), getI32Imm(31, dl) };
- SDValue Shift =
- SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0);
- SDValue Xori =
- SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift,
- getI32Imm(1, dl)), 0);
- return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Xori), 0);
- }
- }
-}
-
-/// Produces a zero-extended result of comparing two 64-bit values according to
-/// the passed condition code.
-SDValue PPCDAGToDAGISel::get64BitZExtCompare(SDValue LHS, SDValue RHS,
- ISD::CondCode CC,
- int64_t RHSValue, SDLoc dl) {
- bool IsRHSZero = RHSValue == 0;
- switch (CC) {
- default: return SDValue();
- case ISD::SETEQ: {
- // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6)
- // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6)
- SDValue Xor = IsRHSZero ? LHS :
- SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
- SDValue Clz =
- SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0);
- return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz,
- getI64Imm(58, dl), getI64Imm(63, dl)),
- 0);
- }
- }
-}
-
-/// Produces a sign-extended result of comparing two 64-bit values according to
-/// the passed condition code.
-SDValue PPCDAGToDAGISel::get64BitSExtCompare(SDValue LHS, SDValue RHS,
- ISD::CondCode CC,
- int64_t RHSValue, SDLoc dl) {
- bool IsRHSZero = RHSValue == 0;
- switch (CC) {
- default: return SDValue();
- case ISD::SETEQ: {
- // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1)
- // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA)
- // {addcz.reg, addcz.CA} = (addcarry %a, -1)
- // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA)
- SDValue AddInput = IsRHSZero ? LHS :
- SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
- SDValue Addic =
- SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue,
- AddInput, getI32Imm(~0U, dl)), 0);
- return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic,
- Addic, Addic.getValue(1)), 0);
- }
- }
-}
-
-/// Does this SDValue have any uses for which keeping the value in a GPR is
-/// appropriate. This is meant to be used on values that have type i1 since
-/// it is somewhat meaningless to ask if values of other types can be kept in
-/// GPR's.
-static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) {
- assert(Compare.getOpcode() == ISD::SETCC &&
- "An ISD::SETCC node required here.");
-
- // For values that have a single use, the caller should obviously already have
- // checked if that use is an extending use. We check the other uses here.
- if (Compare.hasOneUse())
- return true;
- // We want the value in a GPR if it is being extended, used for a select, or
- // used in logical operations.
- for (auto CompareUse : Compare.getNode()->uses())
- if (CompareUse->getOpcode() != ISD::SIGN_EXTEND &&
- CompareUse->getOpcode() != ISD::ZERO_EXTEND &&
- CompareUse->getOpcode() != ISD::SELECT &&
- !isLogicOp(CompareUse->getOpcode())) {
- OmittedForNonExtendUses++;
- return false;
- }
- return true;
-}
-
-/// Returns an equivalent of a SETCC node but with the result the same width as
-/// the inputs. This can nalso be used for SELECT_CC if either the true or false
-/// values is a power of two while the other is zero.
-SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare,
- SetccInGPROpts ConvOpts) {
- assert((Compare.getOpcode() == ISD::SETCC ||
- Compare.getOpcode() == ISD::SELECT_CC) &&
- "An ISD::SETCC node required here.");
-
- // Don't convert this comparison to a GPR sequence because there are uses
- // of the i1 result (i.e. uses that require the result in the CR).
- if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG))
- return SDValue();
-
- SDValue LHS = Compare.getOperand(0);
- SDValue RHS = Compare.getOperand(1);
-
- // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC.
- int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2;
- ISD::CondCode CC =
- cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get();
- EVT InputVT = LHS.getValueType();
- if (InputVT != MVT::i32 && InputVT != MVT::i64)
- return SDValue();
-
- if (ConvOpts == SetccInGPROpts::ZExtInvert ||
- ConvOpts == SetccInGPROpts::SExtInvert)
- CC = ISD::getSetCCInverse(CC, true);
-
- bool Inputs32Bit = InputVT == MVT::i32;
- if (ISD::isSignedIntSetCC(CC) && Inputs32Bit) {
- LHS = signExtendInputIfNeeded(LHS);
- RHS = signExtendInputIfNeeded(RHS);
- } else if (ISD::isUnsignedIntSetCC(CC) && Inputs32Bit) {
- LHS = zeroExtendInputIfNeeded(LHS);
- RHS = zeroExtendInputIfNeeded(RHS);
- }
-
- SDLoc dl(Compare);
- ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
- int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX;
- bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig ||
- ConvOpts == SetccInGPROpts::SExtInvert;
-
- if (IsSext && Inputs32Bit)
- return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
- else if (Inputs32Bit)
- return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
- else if (IsSext)
- return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
- return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
-}
-
/// Does this node represent a load/store node whose address can be represented
/// with a register plus an immediate that's a multiple of \p Val:
bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
@@ -3016,8 +3831,18 @@ bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
AddrOp = STN->getOperand(2);
short Imm = 0;
- if (AddrOp.getOpcode() == ISD::ADD)
+ if (AddrOp.getOpcode() == ISD::ADD) {
+ // If op0 is a frame index that is under-aligned, we can't rely on the
+ // offset either, because it is translated to r31 or r1 + slot + offset and
+ // we won't know the slot number until the stack frame is finalized.
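+ // For example (illustrative), with Val == 4 (e.g. a DS-form access) a frame
+ // object that is only 2-byte aligned may land at a final offset that is not
+ // a multiple of 4, so we have to reject it here.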
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(AddrOp.getOperand(0))) {
+ const MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
+ unsigned SlotAlign = MFI.getObjectAlignment(FI->getIndex());
+ if ((SlotAlign % Val) != 0)
+ return false;
+ }
return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val);
+ }
// If the address comes from the outside, the offset will be zero.
return AddrOp.getOpcode() == ISD::CopyFromReg;
@@ -3050,22 +3875,20 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
if (tryBitPermutation(N))
return;
+ // Try to emit integer compares as GPR-only sequences (i.e. no use of CR).
+ if (tryIntCompareInGPR(N))
+ return;
+
switch (N->getOpcode()) {
default: break;
case ISD::Constant:
if (N->getValueType(0) == MVT::i64) {
- ReplaceNode(N, getInt64(CurDAG, N));
+ ReplaceNode(N, selectI64Imm(CurDAG, N));
return;
}
break;
- case ISD::ZERO_EXTEND:
- case ISD::SIGN_EXTEND:
- if (tryEXTEND(N))
- return;
- break;
-
case ISD::SETCC:
if (trySETCC(N))
return;
@@ -3209,9 +4032,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
case ISD::AND: {
- if (tryLogicOpOfCompares(N))
- return;
-
unsigned Imm, Imm2, SH, MB, ME;
uint64_t Imm64;
@@ -3331,9 +4151,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
if (tryBitfieldInsert(N))
return;
- if (tryLogicOpOfCompares(N))
- return;
-
int16_t Imm;
if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
isIntS16Immediate(N->getOperand(1), Imm)) {
@@ -3348,12 +4165,48 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
}
+ // OR with a 32-bit immediate can be handled by ori + oris
+ // without creating an immediate in a GPR.
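+ // For example (sketch; register names are illustrative), OR with the
+ // immediate 0x12345678 becomes:
+ //   ori  rA, rS, 0x5678   // ImmLo
+ //   oris rA, rA, 0x1234   // ImmHi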
+ uint64_t Imm64 = 0;
+ bool IsPPC64 = PPCSubTarget->isPPC64();
+ if (IsPPC64 && isInt64Immediate(N->getOperand(1), Imm64) &&
+ (Imm64 & ~0xFFFFFFFFuLL) == 0) {
+ // If ImmHi (ImmLo) is zero, only one ori (oris) is generated later.
+ uint64_t ImmHi = Imm64 >> 16;
+ uint64_t ImmLo = Imm64 & 0xFFFF;
+ if (ImmHi != 0 && ImmLo != 0) {
+ SDNode *Lo = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+ N->getOperand(0),
+ getI16Imm(ImmLo, dl));
+ SDValue Ops1[] = { SDValue(Lo, 0), getI16Imm(ImmHi, dl)};
+ CurDAG->SelectNodeTo(N, PPC::ORIS8, MVT::i64, Ops1);
+ return;
+ }
+ }
+
// Other cases are autogenerated.
break;
}
case ISD::XOR: {
- if (tryLogicOpOfCompares(N))
- return;
+ // XOR with a 32-bit immediate can be handled by xori + xoris
+ // without creating an immediate in a GPR.
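+ // For example (sketch; register names are illustrative), XOR with the
+ // immediate 0x12345678 becomes:
+ //   xori  rA, rS, 0x5678   // ImmLo
+ //   xoris rA, rA, 0x1234   // ImmHi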
+ uint64_t Imm64 = 0;
+ bool IsPPC64 = PPCSubTarget->isPPC64();
+ if (IsPPC64 && isInt64Immediate(N->getOperand(1), Imm64) &&
+ (Imm64 & ~0xFFFFFFFFuLL) == 0) {
+ // If ImmHi (ImmLo) is zero, only one xori (xoris) is generated later.
+ uint64_t ImmHi = Imm64 >> 16;
+ uint64_t ImmLo = Imm64 & 0xFFFF;
+ if (ImmHi != 0 && ImmLo != 0) {
+ SDNode *Lo = CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64,
+ N->getOperand(0),
+ getI16Imm(ImmLo, dl));
+ SDValue Ops1[] = { SDValue(Lo, 0), getI16Imm(ImmHi, dl)};
+ CurDAG->SelectNodeTo(N, PPC::XORIS8, MVT::i64, Ops1);
+ return;
+ }
+ }
+
break;
}
case ISD::ADD: {
@@ -3666,9 +4519,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// The first source operand is a TargetGlobalAddress or a TargetJumpTable.
// If it must be toc-referenced according to PPCSubTarget, we generate:
- // LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>))
+ // LDtocL(@sym, ADDIStocHA(%x2, @sym))
// Otherwise we generate:
- // ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>)
+ // ADDItocL(ADDIStocHA(%x2, @sym), @sym)
SDValue GA = N->getOperand(0);
SDValue TOCbase = N->getOperand(1);
SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index b3a3c73f6df03..18e567fa589c7 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -51,6 +51,9 @@
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
@@ -82,11 +85,8 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -114,6 +114,8 @@ cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
+
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
@@ -226,6 +228,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UREM, MVT::i64, Expand);
}
+ if (Subtarget.hasP9Vector()) {
+ setOperationAction(ISD::ABS, MVT::v4i32, Legal);
+ setOperationAction(ISD::ABS, MVT::v8i16, Legal);
+ setOperationAction(ISD::ABS, MVT::v16i8, Legal);
+ }
+
// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
@@ -283,14 +291,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FROUND, MVT::f32, Legal);
}
- // PowerPC does not have BSWAP
+ // PowerPC does not have BSWAP, but we can use the vector BSWAP instruction
+ // xxbrd to speed up scalar BSWAP64.
 // CTPOP or CTTZ were introduced in P8/P9 respectively
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
- setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
if (Subtarget.isISA3_0()) {
+ setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
} else {
+ setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
}
@@ -773,6 +783,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
}
+
+ if (Subtarget.hasP9Altivec()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ }
}
if (Subtarget.hasQPX()) {
@@ -1131,7 +1146,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
- case PPCISD::XXINSERT: return "PPCISD::XXINSERT";
+ case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE";
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
@@ -2413,8 +2428,8 @@ static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit,
SDValue Ops[] = { GA, Reg };
return DAG.getMemIntrinsicNode(
PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true,
- false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0,
+ MachineMemOperand::MOLoad);
}
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
@@ -2470,7 +2485,6 @@ SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
switch (getTargetMachine().getCodeModel()) {
- case CodeModel::Default:
case CodeModel::Small:
case CodeModel::Medium:
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
@@ -2488,7 +2502,6 @@ PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
switch (getTargetMachine().getCodeModel()) {
- case CodeModel::Default:
case CodeModel::Small:
case CodeModel::Medium:
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
@@ -2560,7 +2573,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool is64bit = Subtarget.isPPC64();
- const Module *M = DAG.getMachineFunction().getFunction()->getParent();
+ const Module *M = DAG.getMachineFunction().getFunction().getParent();
PICLevel::Level picLevel = M->getPICLevel();
TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
@@ -3529,7 +3542,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
unsigned &QFPR_idx = FPR_idx;
SmallVector<SDValue, 8> MemOps;
- Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+ Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
@@ -3614,6 +3627,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store;
@@ -3648,6 +3662,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
break;
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Addr = FIN;
if (j) {
@@ -3684,6 +3699,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
@@ -3729,6 +3745,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// since otherwise we never run out of FPRs before running out
// of GPRs.
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::f32) {
@@ -3969,7 +3986,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
SmallVector<SDValue, 8> MemOps;
unsigned nAltivecParamsAtEnd = 0;
- Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+ Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
@@ -4251,13 +4268,25 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
static bool isFunctionGlobalAddress(SDValue Callee);
static bool
-resideInSameSection(const Function *Caller, SDValue Callee,
+callsShareTOCBase(const Function *Caller, SDValue Callee,
const TargetMachine &TM) {
// If !G, Callee can be an external symbol.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G)
return false;
+ // The medium and large code models are expected to provide a sufficiently
+ // large TOC to satisfy all data addressing needs of a module with a
+ // single TOC. Since each module will be addressed with a single TOC, we
+ // only need to check that caller and callee don't cross DSO boundaries.
+ if (CodeModel::Medium == TM.getCodeModel() ||
+ CodeModel::Large == TM.getCodeModel())
+ return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal());
+
+ // Otherwise we need to ensure callee and caller are in the same section,
+ // since the linker may allocate multiple TOCs, and we don't know which
+ // sections will belong to the same TOC base.
+
const GlobalValue *GV = G->getGlobal();
if (!GV->isStrongDefinitionForLinker())
return false;
@@ -4335,12 +4364,12 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget,
}
static bool
-hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
- if (CS->arg_size() != CallerFn->arg_size())
+hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
+ if (CS.arg_size() != CallerFn->arg_size())
return false;
- ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin();
- ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end();
+ ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin();
+ ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end();
Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
@@ -4363,11 +4392,25 @@ hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
return true;
}
+// Returns true if TCO is possible between the caller's and callee's
+// calling conventions.
+static bool
+areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
+ CallingConv::ID CalleeCC) {
+ // Tail or Sibling call optimization (TCO/SCO) needs callee and caller to
+ // have the same calling convention.
+ if (CallerCC != CalleeCC)
+ return false;
+
+ // Tail or Sibling calls can be done with fastcc/ccc.
+ return (CallerCC == CallingConv::Fast || CallerCC == CallingConv::C);
+}
+
bool
PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
SDValue Callee,
CallingConv::ID CalleeCC,
- ImmutableCallSite *CS,
+ ImmutableCallSite CS,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -4379,15 +4422,9 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
// Variadic argument functions are not supported.
if (isVarArg) return false;
- MachineFunction &MF = DAG.getMachineFunction();
- CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
-
- // Tail or Sibling call optimization (TCO/SCO) needs callee and caller has
- // the same calling convention
- if (CallerCC != CalleeCC) return false;
-
- // SCO support C calling convention
- if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C)
+ auto &Caller = DAG.getMachineFunction().getFunction();
+ // Check that the calling conventions are compatible for tco.
+ if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC))
return false;
 // Callers that contain any byval parameters are not supported.
@@ -4406,11 +4443,10 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
!isa<ExternalSymbolSDNode>(Callee))
return false;
- // Check if Callee resides in the same section, because for now, PPC64 SVR4
- // ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another
- // section.
+ // If the caller and callee potentially have different TOC bases then we
+ // cannot tail call since we need to restore the TOC pointer after the call.
// ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
- if (!resideInSameSection(MF.getFunction(), Callee, getTargetMachine()))
+ if (!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
return false;
// TCO allows altering callee ABI, so we don't have to check further.
@@ -4422,7 +4458,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
// If callee use the same argument list that caller is using, then we can
// apply SCO on this case. If it is not, then we need to check if callee needs
// stack for passing arguments.
- if (!hasSameArgumentList(MF.getFunction(), CS) &&
+ if (!hasSameArgumentList(&Caller, CS) &&
needStackSlotPassParameters(Subtarget, Outs)) {
return false;
}
@@ -4447,7 +4483,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
return false;
MachineFunction &MF = DAG.getMachineFunction();
- CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+ CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
// Functions containing by val parameters are not supported.
for (unsigned i = 0; i != Ins.size(); i++) {
@@ -4676,7 +4712,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
bool isPatchPoint, bool hasNest,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
- ImmutableCallSite *CS, const PPCSubtarget &Subtarget) {
+ ImmutableCallSite CS, const PPCSubtarget &Subtarget) {
bool isPPC64 = Subtarget.isPPC64();
bool isSVR4ABI = Subtarget.isSVR4ABI();
bool isELFv2ABI = Subtarget.isELFv2ABI();
@@ -4699,7 +4735,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
// we're building with the leopard linker or later, which automatically
// synthesizes these stubs.
const TargetMachine &TM = DAG.getTarget();
- const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
const GlobalValue *GV = nullptr;
if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee))
GV = G->getGlobal();
@@ -4787,7 +4823,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
MachineMemOperand::MOInvariant)
: MachineMemOperand::MONone;
- MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr);
+ MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr);
SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI,
/* Alignment = */ 8, MMOFlags);
@@ -4917,7 +4953,7 @@ SDValue PPCTargetLowering::FinishCall(
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag,
SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
- SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const {
+ SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {
std::vector<EVT> NodeTys;
SmallVector<SDValue, 8> Ops;
unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
@@ -4992,7 +5028,7 @@ SDValue PPCTargetLowering::FinishCall(
// any other variadic arguments).
Ops.insert(std::next(Ops.begin()), AddTOC);
} else if (CallOpc == PPCISD::CALL &&
- !resideInSameSection(MF.getFunction(), Callee, DAG.getTarget())) {
+ !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) {
// Otherwise insert NOP for non-local calls.
CallOpc = PPCISD::CALL_NOP;
}
@@ -5025,10 +5061,10 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
bool isPatchPoint = CLI.IsPatchPoint;
- ImmutableCallSite *CS = CLI.CS;
+ ImmutableCallSite CS = CLI.CS;
if (isTailCall) {
- if (Subtarget.useLongCalls() && !(CS && CS->isMustTailCall()))
+ if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall()))
isTailCall = false;
else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
isTailCall =
@@ -5056,7 +5092,7 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
}
- if (!isTailCall && CS && CS->isMustTailCall())
+ if (!isTailCall && CS && CS.isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -5090,7 +5126,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const {
+ ImmutableCallSite CS) const {
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
@@ -5324,7 +5360,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const {
+ ImmutableCallSite CS) const {
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
@@ -5974,7 +6010,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const {
+ ImmutableCallSite CS) const {
unsigned NumOps = Outs.size();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -7448,9 +7484,11 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
/// - The node is a "load-and-splat"
/// In all other cases, we will choose to keep the BUILD_VECTOR.
static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
- bool HasDirectMove) {
+ bool HasDirectMove,
+ bool HasP8Vector) {
EVT VecVT = V->getValueType(0);
- bool RightType = VecVT == MVT::v2f64 || VecVT == MVT::v4f32 ||
+ bool RightType = VecVT == MVT::v2f64 ||
+ (HasP8Vector && VecVT == MVT::v4f32) ||
(HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
if (!RightType)
return false;
@@ -7612,7 +7650,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// lowered to VSX instructions under certain conditions.
// Without VSX, there is no pattern more efficient than expanding the node.
if (Subtarget.hasVSX() &&
- haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove()))
+ haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
+ Subtarget.hasP8Vector()))
return Op;
return SDValue();
}
@@ -7646,6 +7685,15 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return DAG.getBitcast(Op.getValueType(), NewBV);
return NewBV;
}
+
+ // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll
+ // detect that constant splats like v8i16: 0xABAB are really just splats
+ // of a 1-byte constant. In this case, we need to convert the node to a
+ // splat of v16i8 and a bitcast.
+ if (Op.getValueType() != MVT::v16i8)
+ return DAG.getBitcast(Op.getValueType(),
+ DAG.getConstant(SplatBits, dl, MVT::v16i8));
+
return Op;
}
@@ -7855,6 +7903,219 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
return DAG.getNode(ISD::BITCAST, dl, VT, T);
}
+/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
+/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
+/// SDValue.
+SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
+ SelectionDAG &DAG) const {
+ const unsigned BytesInVector = 16;
+ bool IsLE = Subtarget.isLittleEndian();
+ SDLoc dl(N);
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+ unsigned ShiftElts = 0, InsertAtByte = 0;
+ bool Swap = false;
+
+ // Shifts required to get the byte we want at element 7.
+ unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
+ 0, 15, 14, 13, 12, 11, 10, 9};
+ unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8};
+
+ ArrayRef<int> Mask = N->getMask();
+ int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ // For each mask element, find out if we're just inserting something
+ // from V2 into V1 or vice versa.
+ // Possible permutations inserting an element from V2 into V1:
+ // X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ // 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ // ...
+ // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
+ // Inserting from V1 into V2 will be similar, except mask range will be
+ // [16,31].
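+ // For example, the mask {0, ..., 12, 25, 14, 15} describes V1 with byte 13
+ // replaced by byte 9 of V2 (mask value 25 == 16 + 9).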
+
+ bool FoundCandidate = false;
+ // If both vector operands for the shuffle are the same vector, the mask
+ // will contain only elements from the first one and the second one will be
+ // undef.
+ unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
+ // Go through the mask of bytes to find an element that's being moved
+ // from one vector to the other.
+ for (unsigned i = 0; i < BytesInVector; ++i) {
+ unsigned CurrentElement = Mask[i];
+ // If the 2nd operand is undefined, we should only look for the VINSERTB
+ // source element (7 for big endian, 8 for little endian) in the Mask.
+ if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
+ continue;
+
+ bool OtherElementsInOrder = true;
+ // Examine the other elements in the Mask to see if they're in original
+ // order.
+ for (unsigned j = 0; j < BytesInVector; ++j) {
+ if (j == i)
+ continue;
+ // If CurrentElement is from V1 [0,15], then we expect the rest of the Mask
+ // to be from V2 [16,31] and vice versa. Unless the 2nd operand is undefined,
+ // in which case we always assume we're picking from the 1st operand.
+ int MaskOffset =
+ (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
+ if (Mask[j] != OriginalOrder[j] + MaskOffset) {
+ OtherElementsInOrder = false;
+ break;
+ }
+ }
+ // If other elements are in original order, we record the number of shifts
+ // we need to get the element we want into element 7. Also record which byte
+ // in the vector we should insert into.
+ if (OtherElementsInOrder) {
+ // If 2nd operand is undefined, we assume no shifts and no swapping.
+ if (V2.isUndef()) {
+ ShiftElts = 0;
+ Swap = false;
+ } else {
+ // Only need the last 4 bits for shifts because the operands will be
+ // swapped below if CurrentElement comes from V1 (i.e. is < 2^4).
+ ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
+ : BigEndianShifts[CurrentElement & 0xF];
+ Swap = CurrentElement < BytesInVector;
+ }
+ InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
+ FoundCandidate = true;
+ break;
+ }
+ }
+
+ if (!FoundCandidate)
+ return SDValue();
+
+ // Candidate found, construct the proper SDAG sequence with VINSERTB,
+ // optionally with VECSHL if shift is required.
+ if (Swap)
+ std::swap(V1, V2);
+ if (V2.isUndef())
+ V2 = V1;
+ if (ShiftElts) {
+ SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
+ DAG.getConstant(ShiftElts, dl, MVT::i32));
+ return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ }
+ return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+}
+
+/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
+/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
+/// SDValue.
+SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
+ SelectionDAG &DAG) const {
+ const unsigned NumHalfWords = 8;
+ const unsigned BytesInVector = NumHalfWords * 2;
+ // Check that the shuffle is on half-words.
+ if (!isNByteElemShuffleMask(N, 2, 1))
+ return SDValue();
+
+ bool IsLE = Subtarget.isLittleEndian();
+ SDLoc dl(N);
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+ unsigned ShiftElts = 0, InsertAtByte = 0;
+ bool Swap = false;
+
+ // Shifts required to get the half-word we want at element 3.
+ unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
+ unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};
+
+ uint32_t Mask = 0;
+ uint32_t OriginalOrderLow = 0x1234567;
+ uint32_t OriginalOrderHigh = 0x89ABCDEF;
+ // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a
+ // 32-bit space, only need 4-bit nibbles per element.
+ for (unsigned i = 0; i < NumHalfWords; ++i) {
+ unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
+ Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
+ }
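+ // For example, the identity mask {0,1,2,3,4,5,6,7} over half-words packs
+ // to 0x1234567 (i.e. 0x01234567), which is OriginalOrderLow below.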
+
+ // For each mask element, find out if we're just inserting something
+ // from V2 into V1 or vice versa. Possible permutations inserting an element
+ // from V2 into V1:
+ // X, 1, 2, 3, 4, 5, 6, 7
+ // 0, X, 2, 3, 4, 5, 6, 7
+ // 0, 1, X, 3, 4, 5, 6, 7
+ // 0, 1, 2, X, 4, 5, 6, 7
+ // 0, 1, 2, 3, X, 5, 6, 7
+ // 0, 1, 2, 3, 4, X, 6, 7
+ // 0, 1, 2, 3, 4, 5, X, 7
+ // 0, 1, 2, 3, 4, 5, 6, X
+ // Inserting from V1 into V2 will be similar, except mask range will be [8,15].
+
+ bool FoundCandidate = false;
+ // Go through the mask of half-words to find an element that's being moved
+ // from one vector to the other.
+ for (unsigned i = 0; i < NumHalfWords; ++i) {
+ unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
+ uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
+ uint32_t MaskOtherElts = ~(0xF << MaskShift);
+ uint32_t TargetOrder = 0x0;
+
+ // If both vector operands for the shuffle are the same vector, the mask
+ // will contain only elements from the first one and the second one will be
+ // undef.
+ if (V2.isUndef()) {
+ ShiftElts = 0;
+ unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
+ TargetOrder = OriginalOrderLow;
+ Swap = false;
+ // Skip if this is not the correct element or the mask of the other
+ // elements doesn't match our expected order.
+ if (MaskOneElt == VINSERTHSrcElem &&
+ (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
+ InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
+ FoundCandidate = true;
+ break;
+ }
+ } else { // If both operands are defined.
+ // Target order is [8,15] if the current mask is between [0,7].
+ TargetOrder =
+ (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
+ // Skip if the mask of the other elements doesn't match our expected order.
+ if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
+ // We only need the last 3 bits for the number of shifts.
+ ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
+ : BigEndianShifts[MaskOneElt & 0x7];
+ InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
+ Swap = MaskOneElt < NumHalfWords;
+ FoundCandidate = true;
+ break;
+ }
+ }
+ }
+
+ if (!FoundCandidate)
+ return SDValue();
+
+ // Candidate found, construct the proper SDAG sequence with VINSERTH,
+ // optionally with VECSHL if shift is required.
+ if (Swap)
+ std::swap(V1, V2);
+ if (V2.isUndef())
+ V2 = V1;
+ SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+ if (ShiftElts) {
+ // Double ShiftElts because we're left shifting on v16i8 type.
+ SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
+ DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
+ SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
+ SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+ }
+ SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
+ SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+}
+
/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
/// is a shuffle we can handle in a single instruction, return it. Otherwise,
/// return the code it can be lowered into. Worst case, it can always be
@@ -7869,7 +8130,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned ShiftElts, InsertAtByte;
- bool Swap;
+ bool Swap = false;
if (Subtarget.hasP9Vector() &&
PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
isLittleEndian)) {
@@ -7880,15 +8141,23 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
if (ShiftElts) {
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
- SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl,
+ SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
- SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2,
+ SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
+ if (Subtarget.hasP9Altivec()) {
+ SDValue NewISDNode;
+ if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
+ return NewISDNode;
+
+ if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
+ return NewISDNode;
+ }
if (Subtarget.hasVSX() &&
PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
@@ -8390,6 +8659,8 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDLoc dl(Op);
+
if (IntrinsicID == Intrinsic::thread_pointer) {
// Reads the thread pointer register, used for __builtin_thread_pointer.
if (Subtarget.isPPC64())
@@ -8397,9 +8668,37 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getRegister(PPC::R2, MVT::i32);
}
+ // We are looking for absolute values here.
+ // The idea is to try to fit one of two patterns:
+ // max (a, (0-a)) OR max ((0-a), a)
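+ // For example (sketch), vec_max(a, vec_sub(0, a)) on v4i32 computes the
+ // per-element absolute value, so it is emitted here as a single ISD::ABS.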
+ if (Subtarget.hasP9Vector() &&
+ (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw ||
+ IntrinsicID == Intrinsic::ppc_altivec_vmaxsh ||
+ IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) {
+ SDValue V1 = Op.getOperand(1);
+ SDValue V2 = Op.getOperand(2);
+ if (V1.getSimpleValueType() == V2.getSimpleValueType() &&
+ (V1.getSimpleValueType() == MVT::v4i32 ||
+ V1.getSimpleValueType() == MVT::v8i16 ||
+ V1.getSimpleValueType() == MVT::v16i8)) {
+ if (V1.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
+ V1.getOperand(1) == V2) {
+ // Generate the abs instruction with the operands.
+ return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
+ }
+
+ if (V2.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
+ V2.getOperand(1) == V1) {
+ // Generate the abs instruction with the operands.
+ return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
+ }
+ }
+ }
+
// If this is a lowered altivec predicate compare, CompareOpc is set to the
// opcode number of the comparison.
- SDLoc dl(Op);
int CompareOpc;
bool isDot;
if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
@@ -8495,6 +8794,23 @@ SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
+// Lower scalar BSWAP64 to xxbrd.
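+// For example, an input of 0x0123456789ABCDEF yields 0xEFCDAB8967452301.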
+SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ // MTVSRDD
+ Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
+ Op.getOperand(0));
+ // XXBRD
+ Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op);
+ // MFVSRD
+ int VectorIndex = 0;
+ if (Subtarget.isLittleEndian())
+ VectorIndex = 1;
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
+ DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
+ return Op;
+}
+
SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -8539,11 +8855,29 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
"Should only be called for ISD::INSERT_VECTOR_ELT");
+
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
// We have legal lowering for constant indices but not for variable ones.
- if (C)
- return Op;
- return SDValue();
+ if (!C)
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
+ if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+ SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
+ unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
+ unsigned InsertAtElement = C->getZExtValue();
+ unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
+ if (Subtarget.isLittleEndian()) {
+ InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
+ }
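+ // For example, inserting into element 3 of a v8i16 gives InsertAtByte 6 on
+ // big endian and (16 - 2) - 6 == 8 on little endian.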
+ return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ }
+ return Op;
}
SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
@@ -8966,6 +9300,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SREM:
case ISD::UREM:
return LowerREM(Op, DAG);
+ case ISD::BSWAP:
+ return LowerBSWAP(Op, DAG);
}
}
@@ -9461,7 +9797,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
// Naked functions never have a base pointer, and so we use r1. For all
// other functions, this decision must be delayed until during PEI.
unsigned BaseReg;
- if (MF->getFunction()->hasFnAttribute(Attribute::Naked))
+ if (MF->getFunction().hasFnAttribute(Attribute::Naked))
BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
else
BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
@@ -11887,9 +12223,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
cast<StoreSDNode>(N)->getMemOperand());
}
+ // STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
+  // This increases the chance of CSEing the materialized constant.
+ EVT VT = N->getOperand(1).getValueType();
+ if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
+ isa<ConstantSDNode>(N->getOperand(1)) && VT == MVT::i32) {
+    // Need to sign-extend to 64 bits to handle negative values.
+ EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
+ uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
+ MemVT.getSizeInBits());
+ SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);
+
+ // DAG.getTruncStore() can't be used here because it doesn't accept
+ // the general (base + offset) addressing mode.
+ // So we use UpdateNodeOperands and setTruncatingStore instead.
+ DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
+ N->getOperand(3));
+ cast<StoreSDNode>(N)->setTruncatingStore(true);
+ return SDValue(N, 0);
+ }
+
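The store combine above swaps the i32 constant operand for a sign-extended i64 constant and marks the store as truncating, so 32-bit and 64-bit stores of the same value can share one materialized constant. The correctness condition is simply that truncating the sign-extended 64-bit constant writes the same bytes the original i32 store would have written; a tiny model of that condition (illustrative, not LLVM code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // A truncating 32-bit store of the sign-extended 64-bit constant must
    // leave the same low 32 bits in memory as the original i32 store.
    void truncStore32(void *P, int64_t Const64) {
      int32_t Low = static_cast<int32_t>(Const64); // keep only the low 32 bits
      std::memcpy(P, &Low, sizeof(Low));
    }

    int main() {
      int32_t Slot = 0;
      truncStore32(&Slot, static_cast<int64_t>(-1)); // i32 -1 stored via i64 -1
      assert(Slot == -1);
      return 0;
    }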
// For little endian, VSX stores require generating xxswapd/lxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
- EVT VT = N->getOperand(1).getValueType();
if (VT.isSimple()) {
MVT StoreVT = VT.getSimpleVT();
if (Subtarget.needsSwapsForVSXMemOps() &&
@@ -12690,6 +13045,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &PPC::QSRCRegClass);
if (Subtarget.hasAltivec())
return std::make_pair(0U, &PPC::VRRCRegClass);
+ break;
case 'y': // crrc
return std::make_pair(0U, &PPC::CRRCRegClass);
}
@@ -12810,7 +13166,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// by AM is legal for this target, for a load/store of the specified type.
bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
- unsigned AS) const {
+ unsigned AS, Instruction *I) const {
// PPC does not allow r+i addressing modes for vectors!
if (Ty->isVectorTy() && AM.BaseOffs != 0)
return false;
@@ -12895,7 +13251,7 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
// Naked functions never have a frame pointer, and so we use r1. For all
// other functions, this decision must be delayed until during PEI.
unsigned FrameReg;
- if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+ if (MF.getFunction().hasFnAttribute(Attribute::Naked))
FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
else
FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
@@ -12940,6 +13296,7 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
+ MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvlfd:
@@ -12992,9 +13349,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
Info.align = 1;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
+ Info.flags = MachineMemOperand::MOLoad;
return true;
}
case Intrinsic::ppc_qpx_qvlfda:
@@ -13028,9 +13383,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.offset = 0;
Info.size = VT.getStoreSize();
Info.align = 1;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
+ Info.flags = MachineMemOperand::MOLoad;
return true;
}
case Intrinsic::ppc_qpx_qvstfd:
@@ -13082,9 +13435,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
Info.align = 1;
- Info.vol = false;
- Info.readMem = false;
- Info.writeMem = true;
+ Info.flags = MachineMemOperand::MOStore;
return true;
}
case Intrinsic::ppc_qpx_qvstfda:
@@ -13117,9 +13468,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.offset = 0;
Info.size = VT.getStoreSize();
Info.align = 1;
- Info.vol = false;
- Info.readMem = false;
- Info.writeMem = true;
+ Info.flags = MachineMemOperand::MOStore;
return true;
}
default:
@@ -13146,12 +13495,12 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
bool MemcpyStrSrc,
MachineFunction &MF) const {
if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
- const Function *F = MF.getFunction();
+ const Function &F = MF.getFunction();
// When expanding a memset, require at least two QPX instructions to cover
// the cost of loading the value to be stored from the constant pool.
if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
(!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
- !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
return MVT::v4f64;
}
@@ -13216,8 +13565,9 @@ bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return TargetLowering::isZExtFree(Val, VT2);
}
-bool PPCTargetLowering::isFPExtFree(EVT VT) const {
- assert(VT.isFloatingPoint());
+bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
+ assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
+ "invalid fpext types");
return true;
}
@@ -13369,7 +13719,7 @@ void PPCTargetLowering::insertCopiesSplitCSR(
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction()->hasFnAttribute(
+ assert(Entry->getParent()->getFunction().hasFnAttribute(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
@@ -13467,3 +13817,38 @@ SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
return SDValue();
}
+
+bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
+  // Only duplicate to increase tail-calls for the 64-bit SysV ABIs.
+ if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64())
+ return false;
+
+ // If not a tail call then no need to proceed.
+ if (!CI->isTailCall())
+ return false;
+
+ // If tail calls are disabled for the caller then we are done.
+ const Function *Caller = CI->getParent()->getParent();
+ auto Attr = Caller->getFnAttribute("disable-tail-calls");
+ if (Attr.getValueAsString() == "true")
+ return false;
+
+ // If sibling calls have been disabled and tail-calls aren't guaranteed
+ // there is no reason to duplicate.
+ auto &TM = getTargetMachine();
+ if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
+ return false;
+
+ // Can't tail call a function called indirectly, or if it has variadic args.
+ const Function *Callee = CI->getCalledFunction();
+ if (!Callee || Callee->isVarArg())
+ return false;
+
+ // Make sure the callee and caller calling conventions are eligible for tco.
+ if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
+ CI->getCallingConv()))
+ return false;
+
+ // If the function is local then we have a good chance at tail-calling it
+ return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 49d7d8220af16..b119e5b4a5649 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
@@ -30,13 +31,20 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
-#include "llvm/Target/TargetLowering.h"
#include <utility>
namespace llvm {
namespace PPCISD {
+ // When adding a NEW PPCISD node please add it to the correct position in
+ // the enum. The order of elements in this enum matters!
+ // Values that are added after this entry:
+ // STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE
+  // are considered memory opcodes and are treated differently from entries
+ // that come before it. For example, ADD or MUL should be placed before
+ // the ISD::FIRST_TARGET_MEMORY_OPCODE while a LOAD or STORE should come
+ // after it.
enum NodeType : unsigned {
// Start the numbering where the builtin ops and target ops leave off.
FIRST_NUMBER = ISD::BUILTIN_OP_END,
@@ -86,15 +94,15 @@ namespace llvm {
///
XXSPLT,
- /// XXINSERT - The PPC VSX insert instruction
+ /// VECINSERT - The PPC vector insert instruction
///
- XXINSERT,
+ VECINSERT,
/// XXREVERSE - The PPC VSX reverse instruction
///
XXREVERSE,
- /// VECSHL - The PPC VSX shift left instruction
+ /// VECSHL - The PPC vector shift left instruction
///
VECSHL,
@@ -254,7 +262,7 @@ namespace llvm {
/// local dynamic TLS on PPC32.
PPC32_PICGOT,
- /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
+ /// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec
/// TLS model, produces an ADDIS8 instruction that adds the GOT
/// base to sym\@got\@tprel\@ha.
ADDIS_GOT_TPREL_HA,
@@ -273,18 +281,18 @@ namespace llvm {
/// TLS sequence.
ADD_TLS,
- /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS
+ /// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS
/// model, produces an ADDIS8 instruction that adds the GOT base
/// register to sym\@got\@tlsgd\@ha.
ADDIS_TLSGD_HA,
- /// %X3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
+ /// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
/// model, produces an ADDI8 instruction that adds G8RReg to
/// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by
/// ADDIS_TLSGD_L_ADDR until after register assignment.
ADDI_TLSGD_L,
- /// %X3 = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS
+ /// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS
/// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by
/// ADDIS_TLSGD_L_ADDR until after register assignment.
GET_TLS_ADDR,
@@ -294,18 +302,18 @@ namespace llvm {
/// register assignment.
ADDI_TLSGD_L_ADDR,
- /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS
+ /// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS
/// model, produces an ADDIS8 instruction that adds the GOT base
/// register to sym\@got\@tlsld\@ha.
ADDIS_TLSLD_HA,
- /// %X3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
+ /// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
/// model, produces an ADDI8 instruction that adds G8RReg to
/// sym\@got\@tlsld\@l and stores the result in X3. Hidden by
/// ADDIS_TLSLD_L_ADDR until after register assignment.
ADDI_TLSLD_L,
- /// %X3 = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS
+ /// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS
/// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by
/// ADDIS_TLSLD_L_ADDR until after register assignment.
GET_TLSLD_ADDR,
@@ -315,7 +323,7 @@ namespace llvm {
/// following register assignment.
ADDI_TLSLD_L_ADDR,
- /// G8RC = ADDIS_DTPREL_HA %X3, Symbol - For the local-dynamic TLS
+ /// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS
/// model, produces an ADDIS8 instruction that adds X3 to
/// sym\@dtprel\@ha.
ADDIS_DTPREL_HA,
@@ -578,8 +586,8 @@ namespace llvm {
bool supportSplitCSR(MachineFunction *MF) const override {
return
- MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
- MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+ MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
}
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
@@ -727,7 +735,8 @@ namespace llvm {
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
- Type *Ty, unsigned AS) const override;
+ Type *Ty, unsigned AS,
+ Instruction *I = nullptr) const override;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
@@ -749,14 +758,14 @@ namespace llvm {
bool isZExtFree(SDValue Val, EVT VT2) const override;
- bool isFPExtFree(EVT VT) const override;
+ bool isFPExtFree(EVT DestVT, EVT SrcVT) const override;
/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
- bool convertSelectOfConstantsToMath() const override {
+ bool convertSelectOfConstantsToMath(EVT VT) const override {
return true;
}
@@ -764,6 +773,7 @@ namespace llvm {
bool getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
+ MachineFunction &MF,
unsigned Intrinsic) const override;
/// getOptimalMemOpType - Returns the target specific optimal type for load
@@ -898,7 +908,7 @@ namespace llvm {
IsEligibleForTailCallOptimization_64SVR4(
SDValue Callee,
CallingConv::ID CalleeCC,
- ImmutableCallSite *CS,
+ ImmutableCallSite CS,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -944,6 +954,7 @@ namespace llvm {
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
@@ -964,7 +975,7 @@ namespace llvm {
SDValue &Callee, int SPDiff, unsigned NumBytes,
const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const;
+ ImmutableCallSite CS) const;
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -1015,7 +1026,7 @@ namespace llvm {
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const;
+ ImmutableCallSite CS) const;
SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, bool isPatchPoint,
@@ -1024,7 +1035,7 @@ namespace llvm {
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const;
+ ImmutableCallSite CS) const;
SDValue LowerCall_32SVR4(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, bool isPatchPoint,
@@ -1033,7 +1044,7 @@ namespace llvm {
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const;
+ ImmutableCallSite CS) const;
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
@@ -1063,7 +1074,23 @@ namespace llvm {
SDValue
combineElementTruncationToVectorTruncation(SDNode *N,
DAGCombinerInfo &DCI) const;
- };
+
+ /// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be
+ /// handled by the VINSERTH instruction introduced in ISA 3.0. This is
+ /// essentially any shuffle of v8i16 vectors that just inserts one element
+ /// from one vector into the other.
+ SDValue lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+
+ /// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be
+ /// handled by the VINSERTB instruction introduced in ISA 3.0. This is
+    /// essentially the v16i8 version of VINSERTH.
+ SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+
+ // Return whether the call instruction can potentially be optimized to a
+ // tail call. This will cause the optimizers to attempt to move, or
+ // duplicate return instructions to help enable tail call optimizations.
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+ }; // end class PPCTargetLowering
namespace PPC {
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index e2af5e5295445..fdd28c2ff03f2 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -194,6 +194,11 @@ def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
(BL8_NOP texternalsym:$dst)>;
// Atomic operations
+// FIXME: some of these might be used with constant operands. This will result
+// in constant materialization instructions that may be redundant. We currently
+// clean this up in PPCMIPeephole with calls to
+// PPCInstrInfo::convertToImmediateForm() but we should probably not emit them
+// in the first place.
let usesCustomInserter = 1 in {
let Defs = [CR0] in {
def ATOMIC_LOAD_ADD_I64 : Pseudo<
@@ -642,8 +647,13 @@ def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS),
defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
"sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
[(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
+
+defm EXTSWSLI : XSForm_1r<31, 445, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
+ "extswsli", "$rA, $rS, $SH", IIC_IntRotateDI,
+ []>, isPPC64;
+
// For fast-isel:
-let isCodeGenOnly = 1 in
+let isCodeGenOnly = 1, Defs = [CARRY] in
def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH),
"sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64;
@@ -673,6 +683,9 @@ def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS),
"popcntw $rA, $rS", IIC_IntGeneral,
[(set i32:$rA, (ctpop i32:$rS))]>;
+def POPCNTB : XForm_11<31, 122, (outs gprc:$rA), (ins gprc:$rS),
+ "popcntb $rA, $rS", IIC_IntGeneral, []>;
+
defm DIVD : XOForm_1rcr<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"divd", "$rT, $rA, $rB", IIC_IntDivD,
[(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64;
@@ -685,6 +698,18 @@ def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
isPPC64, Requires<[HasExtDiv]>;
let Predicates = [IsISA3_0] in {
+def MADDHD : VAForm_1a<48, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
+ "maddhd $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64;
+def MADDHDU : VAForm_1a<49, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
+ "maddhdu $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64;
+def MADDLD : VAForm_1a<51, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
+ "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64;
+def SETB : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA),
+ "setb $RT, $BFA", IIC_IntGeneral>, isPPC64;
+def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins i32imm:$L),
+ "darn $RT, $L", IIC_LdStLD>, isPPC64;
+def ADDPCIS : DXForm<19, 2, (outs g8rc:$RT), (ins i32imm:$D),
+ "addpcis $RT, $D", IIC_BrB, []>, isPPC64;
def MODSD : XForm_8<31, 777, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"modsd $rT, $rA, $rB", IIC_IntDivW,
[(set i64:$rT, (srem i64:$rA, i64:$rB))]>;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 5465b5f2d66cd..e751c149b0b32 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -477,10 +477,10 @@ def VPERM : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm,
def VSEL : VA1a_Int_Ty<42, "vsel", int_ppc_altivec_vsel, v4i32>;
// Shuffles.
-def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH),
+def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u4imm:$SH),
"vsldoi $vD, $vA, $vB, $SH", IIC_VecFP,
- [(set v16i8:$vD,
- (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>;
+ [(set v16i8:$vD,
+ (PPCvecshl v16i8:$vA, v16i8:$vB, imm32SExt16:$SH))]>;
// VX-Form instructions. AltiVec arithmetic ops.
let isCommutable = 1 in {
@@ -908,6 +908,9 @@ def:Pat<(vpkuwum_unary_shuffle v16i8:$vA, undef),
(VPKUWUM $vA, $vA)>;
def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef),
(VPKUHUM $vA, $vA)>;
+def:Pat<(vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB),
+ (VSLDOI v16i8:$vA, v16i8:$vB, (VSLDOI_get_imm $SH))>;
+
// Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands.
// These fragments are matched for little-endian, where the inputs must
@@ -1309,8 +1312,18 @@ def VEXTUWLX : VX1_RT5_RA5_VB5<1677, "vextuwlx", []>;
def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>;
// Vector Insert Element Instructions
-def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>;
-def VINSERTH : VX1_VT5_UIM5_VB5<845, "vinserth", []>;
+def VINSERTB : VXForm_1<781, (outs vrrc:$vD),
+ (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
+ "vinsertb $vD, $vB, $UIM", IIC_VecGeneral,
+ [(set v16i8:$vD, (PPCvecinsert v16i8:$vDi, v16i8:$vB,
+ imm32SExt16:$UIM))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+def VINSERTH : VXForm_1<845, (outs vrrc:$vD),
+ (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
+ "vinserth $vD, $vB, $UIM", IIC_VecGeneral,
+ [(set v8i16:$vD, (PPCvecinsert v8i16:$vDi, v8i16:$vB,
+ imm32SExt16:$UIM))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>;
def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>;
@@ -1488,4 +1501,19 @@ def VABSDUH : VXForm_1<1091, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
def VABSDUW : VXForm_1<1155, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vabsduw $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (int_ppc_altivec_vabsduw v4i32:$vA, v4i32:$vB))]>;
+
+def : Pat<(v16i8:$vD (abs v16i8:$vA)),
+ (v16i8 (VABSDUB $vA, (V_SET0B)))>;
+def : Pat<(v8i16:$vD (abs v8i16:$vA)),
+ (v8i16 (VABSDUH $vA, (V_SET0H)))>;
+def : Pat<(v4i32:$vD (abs v4i32:$vA)),
+ (v4i32 (VABSDUW $vA, (V_SET0)))>;
+
+def : Pat<(v16i8:$vD (abs (sub v16i8:$vA, v16i8:$vB))),
+ (v16i8 (VABSDUB $vA, $vB))>;
+def : Pat<(v8i16:$vD (abs (sub v8i16:$vA, v8i16:$vB))),
+ (v8i16 (VABSDUH $vA, $vB))>;
+def : Pat<(v4i32:$vD (abs (sub v4i32:$vA, v4i32:$vB))),
+ (v4i32 (VABSDUW $vA, $vB))>;
+
} // end HasP9Altivec
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index ef7d2012a2332..f2845415ecb5a 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -386,6 +386,22 @@ class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
let Inst{30-31} = xo;
}
+// ISA V3.0B 1.6.6 DX-Form
+class DXForm<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<16> D;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = D{5-1}; // d1
+ let Inst{16-25} = D{15-6}; // d0
+ let Inst{26-30} = xo;
+ let Inst{31} = D{0}; // d2
+}
+
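For reference, the DXForm class above (used by addpcis) scatters the 16-bit D operand across three instruction fields: d0 = D[15:6], d1 = D[5:1] and d2 = D[0]. A small round-trip check of that split (helper names are illustrative):

    #include <cassert>
    #include <cstdint>

    struct DXFields { uint32_t D0, D1, D2; };

    // Split exactly as the record above declares: d0 = D[15:6],
    // d1 = D[5:1], d2 = D[0].
    DXFields splitD(uint16_t D) {
      return {static_cast<uint32_t>(D >> 6) & 0x3FF,
              static_cast<uint32_t>(D >> 1) & 0x1F,
              static_cast<uint32_t>(D) & 0x1};
    }

    uint16_t joinD(const DXFields &F) {
      return static_cast<uint16_t>((F.D0 << 6) | (F.D1 << 1) | F.D2);
    }

    int main() {
      for (uint32_t D = 0; D <= 0xFFFF; ++D)
        assert(joinD(splitD(static_cast<uint16_t>(D))) == D);
      return 0;
    }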
// DQ-Form: [PO T RA DQ TX XO] or [PO S RA DQ SX XO]
class DQ_RD6_RS5_DQ12<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
string asmstr, InstrItinClass itin, list<dag> pattern>
@@ -725,6 +741,96 @@ class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = RC;
}
+class XForm_44<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<3> BFA;
+
+ let Inst{6-10} = RT;
+ let Inst{11-13} = BFA;
+ let Inst{14-15} = 0;
+ let Inst{16-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XForm_45<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<2> L;
+
+ let Inst{6-10} = RT;
+ let Inst{11-13} = 0;
+ let Inst{14-15} = L;
+ let Inst{16-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class X_FRT5_XO2_XO3_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2, bits<10> xo,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let Pattern = pattern;
+
+ let Inst{6-10} = RST;
+ let Inst{11-12} = xo1;
+ let Inst{13-15} = xo2;
+ let Inst{16-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class X_FRT5_XO2_XO3_FRB5_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2,
+ bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let Pattern = pattern;
+ bits<5> FRB;
+
+ let Inst{6-10} = RST;
+ let Inst{11-12} = xo1;
+ let Inst{13-15} = xo2;
+ let Inst{16-20} = FRB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class X_FRT5_XO2_XO3_DRM3_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2,
+ bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let Pattern = pattern;
+ bits<3> DRM;
+
+ let Inst{6-10} = RST;
+ let Inst{11-12} = xo1;
+ let Inst{13-15} = xo2;
+ let Inst{16-17} = 0;
+ let Inst{18-20} = DRM;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class X_FRT5_XO2_XO3_RM2_X10<bits<6> opcode, bits<2> xo1, bits<3> xo2,
+ bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let Pattern = pattern;
+ bits<2> RM;
+
+ let Inst{6-10} = RST;
+ let Inst{11-12} = xo1;
+ let Inst{13-15} = xo2;
+ let Inst{16-18} = 0;
+ let Inst{19-20} = RM;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+
class XForm_0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
@@ -1995,4 +2101,5 @@ class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
let PPC64 = 0;
let Pattern = pattern;
let Inst{31-0} = 0;
+ let hasNoSchedulingInfo = 1;
}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index e74ba38c351f0..ffb5cc8757f25 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -20,7 +20,7 @@
#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -46,6 +46,16 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "PPCGenInstrInfo.inc"
+STATISTIC(NumStoreSPILLVSRRCAsVec,
+ "Number of spillvsrrc spilled to stack as vec");
+STATISTIC(NumStoreSPILLVSRRCAsGpr,
+ "Number of spillvsrrc spilled to stack as gpr");
+STATISTIC(NumGPRtoVSRSpill, "Number of gpr spills to spillvsrrc");
+STATISTIC(CmpIselsConverted,
+ "Number of ISELs that depend on comparison of constants converted");
+STATISTIC(MissedConvertibleImmediateInstrs,
+ "Number of compare-immediate instructions fed by constants");
+
static cl::
opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
cl::desc("Disable analysis for CTR loops"));
@@ -254,6 +264,7 @@ bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
switch (MI.getOpcode()) {
default: return false;
case PPC::EXTSW:
+ case PPC::EXTSW_32:
case PPC::EXTSW_32_64:
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
@@ -275,11 +286,12 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
case PPC::RESTORE_CRBIT:
case PPC::LVX:
case PPC::LXVD2X:
- case PPC::LXVX:
+ case PPC::LXV:
case PPC::QVLFDX:
case PPC::QVLFSXs:
case PPC::QVLFDXb:
case PPC::RESTORE_VRSAVE:
+ case PPC::SPILLTOVSR_LD:
// Check for the operands added by addFrameReference (the immediate is the
// offset which defaults to 0).
if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
@@ -328,11 +340,12 @@ unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
case PPC::SPILL_CRBIT:
case PPC::STVX:
case PPC::STXVD2X:
- case PPC::STXVX:
+ case PPC::STXV:
case PPC::QVSTFDX:
case PPC::QVSTFSXs:
case PPC::QVSTFDXb:
case PPC::SPILL_VRSAVE:
+ case PPC::SPILLTOVSR_ST:
// Check for the operands added by addFrameReference (the immediate is the
// offset which defaults to 0).
if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
@@ -486,6 +499,20 @@ bool PPCInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
if (!isUnpredicatedTerminator(*I))
return false;
+ if (AllowModify) {
+ // If the BB ends with an unconditional branch to the fallthrough BB,
+ // we eliminate the branch instruction.
+ if (I->getOpcode() == PPC::B &&
+ MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ I->eraseFromParent();
+
+ // We update iterator after deleting the last branch.
+ I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end() || !isUnpredicatedTerminator(*I))
+ return false;
+ }
+ }
+
// Get the last instruction in the block.
MachineInstr &LastInst = *I;
@@ -917,7 +944,18 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
return;
- }
+ } else if (PPC::G8RCRegClass.contains(SrcReg) &&
+ PPC::VSFRCRegClass.contains(DestReg)) {
+ BuildMI(MBB, I, DL, get(PPC::MTVSRD), DestReg).addReg(SrcReg);
+ NumGPRtoVSRSpill++;
+ getKillRegState(KillSrc);
+ return;
+ } else if (PPC::VSFRCRegClass.contains(SrcReg) &&
+ PPC::G8RCRegClass.contains(DestReg)) {
+ BuildMI(MBB, I, DL, get(PPC::MFVSRD), DestReg).addReg(SrcReg);
+ getKillRegState(KillSrc);
+ return;
+ }
unsigned Opc;
if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
@@ -1015,7 +1053,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
FrameIdx));
NonRI = true;
} else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
- unsigned Op = Subtarget.hasP9Vector() ? PPC::STXVX : PPC::STXVD2X;
+ unsigned Op = Subtarget.hasP9Vector() ? PPC::STXV : PPC::STXVD2X;
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op))
.addReg(SrcReg,
getKillRegState(isKill)),
@@ -1061,6 +1099,11 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
getKillRegState(isKill)),
FrameIdx));
NonRI = true;
+ } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILLTOVSR_ST))
+ .addReg(SrcReg,
+ getKillRegState(isKill)),
+ FrameIdx));
} else {
llvm_unreachable("Unknown regclass!");
}
@@ -1148,7 +1191,7 @@ bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
FrameIdx));
NonRI = true;
} else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
- unsigned Op = Subtarget.hasP9Vector() ? PPC::LXVX : PPC::LXVD2X;
+ unsigned Op = Subtarget.hasP9Vector() ? PPC::LXV : PPC::LXVD2X;
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op), DestReg),
FrameIdx));
NonRI = true;
@@ -1182,6 +1225,9 @@ bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDXb), DestReg),
FrameIdx));
NonRI = true;
+ } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILLTOVSR_LD),
+ DestReg), FrameIdx));
} else {
llvm_unreachable("Unknown regclass!");
}
@@ -1592,37 +1638,20 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI) return false;
- int MIOpC = MI->getOpcode();
bool equalityOnly = false;
bool noSub = false;
if (isPPC64) {
if (is32BitSignedCompare) {
// We can perform this optimization only if MI is sign-extending.
- if (MIOpC == PPC::SRAW || MIOpC == PPC::SRAWo ||
- MIOpC == PPC::SRAWI || MIOpC == PPC::SRAWIo ||
- MIOpC == PPC::EXTSB || MIOpC == PPC::EXTSBo ||
- MIOpC == PPC::EXTSH || MIOpC == PPC::EXTSHo ||
- MIOpC == PPC::EXTSW || MIOpC == PPC::EXTSWo) {
+ if (isSignExtended(*MI))
noSub = true;
- } else
+ else
return false;
} else if (is32BitUnsignedCompare) {
- // 32-bit rotate and mask instructions are zero extending only if MB <= ME
- bool isZeroExtendingRotate =
- (MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINMo ||
- MIOpC == PPC::RLWNM || MIOpC == PPC::RLWNMo)
- && MI->getOperand(3).getImm() <= MI->getOperand(4).getImm();
-
// We can perform this optimization, equality only, if MI is
// zero-extending.
- // FIXME: Other possible target instructions include ANDISo and
- // RLWINM aliases, such as ROTRWI, EXTLWI, SLWI and SRWI.
- if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo ||
- MIOpC == PPC::SLW || MIOpC == PPC::SLWo ||
- MIOpC == PPC::SRW || MIOpC == PPC::SRWo ||
- MIOpC == PPC::ANDIo ||
- isZeroExtendingRotate) {
+ if (isZeroExtended(*MI)) {
noSub = true;
equalityOnly = true;
} else
@@ -1640,8 +1669,10 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
I != IE; ++I) {
MachineInstr *UseMI = &*I;
if (UseMI->getOpcode() == PPC::BCC) {
- unsigned Pred = UseMI->getOperand(0).getImm();
- if (Pred != PPC::PRED_EQ && Pred != PPC::PRED_NE)
+ PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
+ unsigned PredCond = PPC::getPredicateCondition(Pred);
+ // We ignore hint bits when checking for non-equality comparisons.
+ if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE)
return false;
} else if (UseMI->getOpcode() == PPC::ISEL ||
UseMI->getOpcode() == PPC::ISEL8) {
@@ -1688,34 +1719,47 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
else if (MI->getParent() != CmpInstr.getParent())
return false;
else if (Value != 0) {
- // The record-form instructions set CR bit based on signed comparison against 0.
- // We try to convert a compare against 1 or -1 into a compare against 0.
- bool Success = false;
- if (!equalityOnly && MRI->hasOneUse(CRReg)) {
- MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg);
- if (UseMI->getOpcode() == PPC::BCC) {
- PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
- int16_t Immed = (int16_t)Value;
+ // The record-form instructions set CR bit based on signed comparison
+ // against 0. We try to convert a compare against 1 or -1 into a compare
+ // against 0 to exploit record-form instructions. For example, we change
+ // the condition "greater than -1" into "greater than or equal to 0"
+ // and "less than 1" into "less than or equal to 0".
- if (Immed == -1 && Pred == PPC::PRED_GT) {
- // We convert "greater than -1" into "greater than or equal to 0",
- // since we are assuming signed comparison by !equalityOnly
- PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
- PPC::PRED_GE));
- Success = true;
- }
- else if (Immed == 1 && Pred == PPC::PRED_LT) {
- // We convert "less than 1" into "less than or equal to 0".
- PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
- PPC::PRED_LE));
- Success = true;
- }
- }
- }
+    // Since we optimize the comparison based on a specific branch condition,
+    // we don't optimize if the condition code is used more than once.
+ if (equalityOnly || !MRI->hasOneUse(CRReg))
+ return false;
+
+ MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg);
+ if (UseMI->getOpcode() != PPC::BCC)
+ return false;
- // PPC does not have a record-form SUBri.
- if (!Success)
+ PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
+ PPC::Predicate NewPred = Pred;
+ unsigned PredCond = PPC::getPredicateCondition(Pred);
+ unsigned PredHint = PPC::getPredicateHint(Pred);
+ int16_t Immed = (int16_t)Value;
+
+    // When modifying the condition in the predicate, we propagate hint bits
+ // from the original predicate to the new one.
+ if (Immed == -1 && PredCond == PPC::PRED_GT)
+ // We convert "greater than -1" into "greater than or equal to 0",
+ // since we are assuming signed comparison by !equalityOnly
+ NewPred = PPC::getPredicate(PPC::PRED_GE, PredHint);
+ else if (Immed == -1 && PredCond == PPC::PRED_LE)
+ // We convert "less than or equal to -1" into "less than 0".
+ NewPred = PPC::getPredicate(PPC::PRED_LT, PredHint);
+ else if (Immed == 1 && PredCond == PPC::PRED_LT)
+ // We convert "less than 1" into "less than or equal to 0".
+ NewPred = PPC::getPredicate(PPC::PRED_LE, PredHint);
+ else if (Immed == 1 && PredCond == PPC::PRED_GE)
+ // We convert "greater than or equal to 1" into "greater than 0".
+ NewPred = PPC::getPredicate(PPC::PRED_GT, PredHint);
+ else
return false;
+
+ PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
+ NewPred));
}
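The four predicate rewrites above (greater than -1 becomes greater than or equal to 0, less than or equal to -1 becomes less than 0, less than 1 becomes less than or equal to 0, greater than or equal to 1 becomes greater than 0) are plain integer equivalences; that is what lets the record form's implicit compare against zero stand in for the original compare. A trivial check, for illustration:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int64_t X = -3; X <= 3; ++X) {
        assert((X > -1) == (X >= 0));
        assert((X <= -1) == (X < 0));
        assert((X < 1) == (X <= 0));
        assert((X >= 1) == (X > 0));
      }
      return 0;
    }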
// Search for Sub.
@@ -1763,7 +1807,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (!MI) MI = Sub;
int NewOpC = -1;
- MIOpC = MI->getOpcode();
+ int MIOpC = MI->getOpcode();
if (MIOpC == PPC::ANDIo || MIOpC == PPC::ANDIo8)
NewOpC = MIOpC;
else {
@@ -1804,9 +1848,11 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
MachineInstr *UseMI = &*I;
if (UseMI->getOpcode() == PPC::BCC) {
PPC::Predicate Pred = (PPC::Predicate) UseMI->getOperand(0).getImm();
+ unsigned PredCond = PPC::getPredicateCondition(Pred);
assert((!equalityOnly ||
- Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE) &&
+ PredCond == PPC::PRED_EQ || PredCond == PPC::PRED_NE) &&
"Invalid predicate for equality-only optimization");
+ (void)PredCond; // To suppress warning in release build.
PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
PPC::getSwappedPredicate(Pred)));
} else if (UseMI->getOpcode() == PPC::ISEL ||
@@ -1935,29 +1981,13 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
return makeArrayRef(TargetFlags);
}
-bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
- auto &MBB = *MI.getParent();
- auto DL = MI.getDebugLoc();
- switch (MI.getOpcode()) {
- case TargetOpcode::LOAD_STACK_GUARD: {
- assert(Subtarget.isTargetLinux() &&
- "Only Linux target is expected to contain LOAD_STACK_GUARD");
- const int64_t Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008;
- const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2;
- MI.setDesc(get(Subtarget.isPPC64() ? PPC::LD : PPC::LWZ));
- MachineInstrBuilder(*MI.getParent()->getParent(), MI)
- .addImm(Offset)
- .addReg(Reg);
- return true;
- }
- case PPC::DFLOADf32:
- case PPC::DFLOADf64:
- case PPC::DFSTOREf32:
- case PPC::DFSTOREf64: {
- assert(Subtarget.hasP9Vector() &&
- "Invalid D-Form Pseudo-ops on non-P9 target.");
- assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() &&
- "D-form op must have register and immediate operands");
+// Expand a VSX memory pseudo instruction to either a VSX or an FP instruction.
+// The VSX versions have the advantage of addressing the full 64-register VSX
+// file, whereas the FP ones have the advantage of lower latency and higher
+// throughput. So what we are after is using the faster instructions in low
+// register pressure situations and using the larger register file in high
+// register pressure situations.
+bool PPCInstrInfo::expandVSXMemPseudo(MachineInstr &MI) const {
unsigned UpperOpcode, LowerOpcode;
switch (MI.getOpcode()) {
case PPC::DFLOADf32:
@@ -1976,7 +2006,38 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
UpperOpcode = PPC::STXSD;
LowerOpcode = PPC::STFD;
break;
+ case PPC::XFLOADf32:
+ UpperOpcode = PPC::LXSSPX;
+ LowerOpcode = PPC::LFSX;
+ break;
+ case PPC::XFLOADf64:
+ UpperOpcode = PPC::LXSDX;
+ LowerOpcode = PPC::LFDX;
+ break;
+ case PPC::XFSTOREf32:
+ UpperOpcode = PPC::STXSSPX;
+ LowerOpcode = PPC::STFSX;
+ break;
+ case PPC::XFSTOREf64:
+ UpperOpcode = PPC::STXSDX;
+ LowerOpcode = PPC::STFDX;
+ break;
+ case PPC::LIWAX:
+ UpperOpcode = PPC::LXSIWAX;
+ LowerOpcode = PPC::LFIWAX;
+ break;
+ case PPC::LIWZX:
+ UpperOpcode = PPC::LXSIWZX;
+ LowerOpcode = PPC::LFIWZX;
+ break;
+ case PPC::STIWX:
+ UpperOpcode = PPC::STXSIWX;
+ LowerOpcode = PPC::STFIWX;
+ break;
+ default:
+ llvm_unreachable("Unknown Operation!");
}
+
unsigned TargetReg = MI.getOperand(0).getReg();
unsigned Opcode;
if ((TargetReg >= PPC::F0 && TargetReg <= PPC::F31) ||
@@ -1986,7 +2047,95 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Opcode = UpperOpcode;
MI.setDesc(get(Opcode));
return true;
+}
+
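The tail of expandVSXMemPseudo (partly trimmed by the diff context above) chooses between the two opcodes based on which half of the VSX register file the operand was allocated to, matching the latency versus register-pressure trade-off described in the comment before the function. A rough model of that kind of selection, with hypothetical numbering and names rather than the patch's exact code:

    #include <cassert>
    #include <string>

    // Registers in the classic half of the file take the lower-latency FP
    // opcode; the VSX-only upper half takes the VSX opcode.
    std::string pickOpcode(unsigned RegIndex, const std::string &VSXOpc,
                           const std::string &FPOpc) {
      const bool InClassicHalf = RegIndex < 32; // analogue of F0..F31
      return InClassicHalf ? FPOpc : VSXOpc;
    }

    int main() {
      assert(pickOpcode(5, "LXSDX", "LFDX") == "LFDX");
      assert(pickOpcode(40, "LXSDX", "LFDX") == "LXSDX");
      return 0;
    }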
+bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ auto &MBB = *MI.getParent();
+ auto DL = MI.getDebugLoc();
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::LOAD_STACK_GUARD: {
+ assert(Subtarget.isTargetLinux() &&
+ "Only Linux target is expected to contain LOAD_STACK_GUARD");
+ const int64_t Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008;
+ const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2;
+ MI.setDesc(get(Subtarget.isPPC64() ? PPC::LD : PPC::LWZ));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(Offset)
+ .addReg(Reg);
+ return true;
}
+ case PPC::DFLOADf32:
+ case PPC::DFLOADf64:
+ case PPC::DFSTOREf32:
+ case PPC::DFSTOREf64: {
+ assert(Subtarget.hasP9Vector() &&
+ "Invalid D-Form Pseudo-ops on Pre-P9 target.");
+ assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() &&
+ "D-form op must have register and immediate operands");
+ return expandVSXMemPseudo(MI);
+ }
+ case PPC::XFLOADf32:
+ case PPC::XFSTOREf32:
+ case PPC::LIWAX:
+ case PPC::LIWZX:
+ case PPC::STIWX: {
+ assert(Subtarget.hasP8Vector() &&
+ "Invalid X-Form Pseudo-ops on Pre-P8 target.");
+ assert(MI.getOperand(2).isReg() && MI.getOperand(1).isReg() &&
+ "X-form op must have register and register operands");
+ return expandVSXMemPseudo(MI);
+ }
+ case PPC::XFLOADf64:
+ case PPC::XFSTOREf64: {
+ assert(Subtarget.hasVSX() &&
+ "Invalid X-Form Pseudo-ops on target that has no VSX.");
+ assert(MI.getOperand(2).isReg() && MI.getOperand(1).isReg() &&
+ "X-form op must have register and register operands");
+ return expandVSXMemPseudo(MI);
+ }
+ case PPC::SPILLTOVSR_LD: {
+ unsigned TargetReg = MI.getOperand(0).getReg();
+ if (PPC::VSFRCRegClass.contains(TargetReg)) {
+ MI.setDesc(get(PPC::DFLOADf64));
+ return expandPostRAPseudo(MI);
+    } else
+      MI.setDesc(get(PPC::LD));
+ return true;
+ }
+ case PPC::SPILLTOVSR_ST: {
+ unsigned SrcReg = MI.getOperand(0).getReg();
+ if (PPC::VSFRCRegClass.contains(SrcReg)) {
+ NumStoreSPILLVSRRCAsVec++;
+ MI.setDesc(get(PPC::DFSTOREf64));
+ return expandPostRAPseudo(MI);
+ } else {
+ NumStoreSPILLVSRRCAsGpr++;
+ MI.setDesc(get(PPC::STD));
+ }
+ return true;
+ }
+ case PPC::SPILLTOVSR_LDX: {
+ unsigned TargetReg = MI.getOperand(0).getReg();
+ if (PPC::VSFRCRegClass.contains(TargetReg))
+ MI.setDesc(get(PPC::LXSDX));
+ else
+ MI.setDesc(get(PPC::LDX));
+ return true;
+ }
+ case PPC::SPILLTOVSR_STX: {
+ unsigned SrcReg = MI.getOperand(0).getReg();
+ if (PPC::VSFRCRegClass.contains(SrcReg)) {
+ NumStoreSPILLVSRRCAsVec++;
+ MI.setDesc(get(PPC::STXSDX));
+ } else {
+ NumStoreSPILLVSRRCAsGpr++;
+ MI.setDesc(get(PPC::STDX));
+ }
+ return true;
+ }
+
case PPC::CFENCE8: {
auto Val = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(PPC::CMPD), PPC::CR7).addReg(Val).addReg(Val);
@@ -2002,6 +2151,829 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return false;
}
+unsigned PPCInstrInfo::lookThruCopyLike(unsigned SrcReg,
+ const MachineRegisterInfo *MRI) {
+ while (true) {
+ MachineInstr *MI = MRI->getVRegDef(SrcReg);
+ if (!MI->isCopyLike())
+ return SrcReg;
+
+ unsigned CopySrcReg;
+ if (MI->isCopy())
+ CopySrcReg = MI->getOperand(1).getReg();
+ else {
+ assert(MI->isSubregToReg() && "Bad opcode for lookThruCopyLike");
+ CopySrcReg = MI->getOperand(2).getReg();
+ }
+
+ if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
+ return CopySrcReg;
+
+ SrcReg = CopySrcReg;
+ }
+}
+
+// Essentially a compile-time implementation of a compare->isel sequence.
+// It takes two constants to compare, along with the true/false registers
+// and the comparison type (as a subreg to a CR field) and returns one
+// of the true/false registers, depending on the comparison results.
+static unsigned selectReg(int64_t Imm1, int64_t Imm2, unsigned CompareOpc,
+ unsigned TrueReg, unsigned FalseReg,
+ unsigned CRSubReg) {
+ // Signed comparisons. The immediates are assumed to be sign-extended.
+ if (CompareOpc == PPC::CMPWI || CompareOpc == PPC::CMPDI) {
+ switch (CRSubReg) {
+ default: llvm_unreachable("Unknown integer comparison type.");
+ case PPC::sub_lt:
+ return Imm1 < Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_gt:
+ return Imm1 > Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_eq:
+ return Imm1 == Imm2 ? TrueReg : FalseReg;
+ }
+ }
+ // Unsigned comparisons.
+ else if (CompareOpc == PPC::CMPLWI || CompareOpc == PPC::CMPLDI) {
+ switch (CRSubReg) {
+ default: llvm_unreachable("Unknown integer comparison type.");
+ case PPC::sub_lt:
+ return (uint64_t)Imm1 < (uint64_t)Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_gt:
+ return (uint64_t)Imm1 > (uint64_t)Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_eq:
+ return Imm1 == Imm2 ? TrueReg : FalseReg;
+ }
+ }
+ return PPC::NoRegister;
+}
+
+// Replace an instruction with one that materializes a constant (and sets
+// CR0 if the original instruction was a record-form instruction).
+void PPCInstrInfo::replaceInstrWithLI(MachineInstr &MI,
+ const LoadImmediateInfo &LII) const {
+ // Remove existing operands.
+ int OperandToKeep = LII.SetCR ? 1 : 0;
+ for (int i = MI.getNumOperands() - 1; i > OperandToKeep; i--)
+ MI.RemoveOperand(i);
+
+ // Replace the instruction.
+ if (LII.SetCR) {
+ MI.setDesc(get(LII.Is64Bit ? PPC::ANDIo8 : PPC::ANDIo));
+ // Set the immediate.
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(LII.Imm).addReg(PPC::CR0, RegState::ImplicitDefine);
+ return;
+  } else
+ MI.setDesc(get(LII.Is64Bit ? PPC::LI8 : PPC::LI));
+
+ // Set the immediate.
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(LII.Imm);
+}
+
+MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
+ unsigned &ConstOp,
+ bool &SeenIntermediateUse) const {
+ ConstOp = ~0U;
+ MachineInstr *DefMI = nullptr;
+ MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
+  // If we're in SSA, get the defs through the MRI. Otherwise, only look
+ // within the basic block to see if the register is defined using an LI/LI8.
+ if (MRI->isSSA()) {
+ for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
+ if (!MI.getOperand(i).isReg())
+ continue;
+ unsigned Reg = MI.getOperand(i).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ unsigned TrueReg = lookThruCopyLike(Reg, MRI);
+ if (TargetRegisterInfo::isVirtualRegister(TrueReg)) {
+ DefMI = MRI->getVRegDef(TrueReg);
+ if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) {
+ ConstOp = i;
+ break;
+ }
+ }
+ }
+ } else {
+ // Looking back through the definition for each operand could be expensive,
+ // so exit early if this isn't an instruction that either has an immediate
+ // form or is already an immediate form that we can handle.
+ ImmInstrInfo III;
+ unsigned Opc = MI.getOpcode();
+ bool ConvertibleImmForm =
+ Opc == PPC::CMPWI || Opc == PPC::CMPLWI ||
+ Opc == PPC::CMPDI || Opc == PPC::CMPLDI ||
+ Opc == PPC::ADDI || Opc == PPC::ADDI8 ||
+ Opc == PPC::ORI || Opc == PPC::ORI8 ||
+ Opc == PPC::XORI || Opc == PPC::XORI8 ||
+ Opc == PPC::RLDICL || Opc == PPC::RLDICLo ||
+ Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 ||
+ Opc == PPC::RLWINM || Opc == PPC::RLWINMo ||
+ Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o;
+ if (!instrHasImmForm(MI, III) && !ConvertibleImmForm)
+ return nullptr;
+
+ // Don't convert or %X, %Y, %Y since that's just a register move.
+ if ((Opc == PPC::OR || Opc == PPC::OR8) &&
+ MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
+ return nullptr;
+ for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
+ MachineOperand &MO = MI.getOperand(i);
+ SeenIntermediateUse = false;
+ if (MO.isReg() && MO.isUse() && !MO.isImplicit()) {
+ MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI;
+ It++;
+ unsigned Reg = MI.getOperand(i).getReg();
+ // MachineInstr::readsRegister only returns true if the machine
+ // instruction reads the exact register or its super-register. It
+ // does not consider uses of sub-registers which seems like strange
+ // behaviour. Nonetheless, if we end up with a 64-bit register here,
+ // get the corresponding 32-bit register to check.
+ if (PPC::G8RCRegClass.contains(Reg))
+ Reg = Reg - PPC::X0 + PPC::R0;
+
+ // Is this register defined by a load-immediate in this block?
+ for ( ; It != E; ++It) {
+ if (It->modifiesRegister(Reg, &getRegisterInfo())) {
+ if (It->getOpcode() == PPC::LI || It->getOpcode() == PPC::LI8) {
+ ConstOp = i;
+ return &*It;
+ } else
+ break;
+ } else if (It->readsRegister(Reg, &getRegisterInfo()))
+ // If we see another use of this reg between the def and the MI,
+            // we want to flag it so the def isn't deleted.
+ SeenIntermediateUse = true;
+ }
+ }
+ }
+ }
+ return ConstOp == ~0U ? nullptr : DefMI;
+}
+
+// If this instruction has an immediate form and one of its operands is a
+// result of a load-immediate, convert it to the immediate form if the constant
+// is in range.
+bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
+ MachineInstr **KilledDef) const {
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ bool PostRA = !MRI->isSSA();
+ bool SeenIntermediateUse = true;
+ unsigned ConstantOperand = ~0U;
+ MachineInstr *DefMI = getConstantDefMI(MI, ConstantOperand,
+ SeenIntermediateUse);
+ if (!DefMI || !DefMI->getOperand(1).isImm())
+ return false;
+ assert(ConstantOperand < MI.getNumOperands() &&
+ "The constant operand needs to be valid at this point");
+
+ int64_t Immediate = DefMI->getOperand(1).getImm();
+ // Sign-extend to 64-bits.
+ int64_t SExtImm = ((uint64_t)Immediate & ~0x7FFFuLL) != 0 ?
+ (Immediate | 0xFFFFFFFFFFFF0000) : Immediate;
+
+ if (KilledDef && MI.getOperand(ConstantOperand).isKill() &&
+ !SeenIntermediateUse)
+ *KilledDef = DefMI;
+
+ // If this is a reg+reg instruction that has a reg+imm form, convert it now.
+ ImmInstrInfo III;
+ if (instrHasImmForm(MI, III))
+ return transformToImmForm(MI, III, ConstantOperand, SExtImm);
+
+ bool ReplaceWithLI = false;
+ bool Is64BitLI = false;
+ int64_t NewImm = 0;
+ bool SetCR = false;
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default: return false;
+
+ // FIXME: Any branches conditional on such a comparison can be made
+ // unconditional. At this time, this happens too infrequently to be worth
+ // the implementation effort, but if that ever changes, we could convert
+ // such a pattern here.
+ case PPC::CMPWI:
+ case PPC::CMPLWI:
+ case PPC::CMPDI:
+ case PPC::CMPLDI: {
+ // Doing this post-RA would require dataflow analysis to reliably find uses
+ // of the CR register set by the compare.
+ if (PostRA)
+ return false;
+    // If a compare-immediate is fed by an immediate and is itself an input of
+    // an ISEL (the most common case), convert the ISEL into a COPY of the
+    // correct register.
+ bool Changed = false;
+ unsigned DefReg = MI.getOperand(0).getReg();
+ int64_t Comparand = MI.getOperand(2).getImm();
+ int64_t SExtComparand = ((uint64_t)Comparand & ~0x7FFFuLL) != 0 ?
+ (Comparand | 0xFFFFFFFFFFFF0000) : Comparand;
+
+ for (auto &CompareUseMI : MRI->use_instructions(DefReg)) {
+ unsigned UseOpc = CompareUseMI.getOpcode();
+ if (UseOpc != PPC::ISEL && UseOpc != PPC::ISEL8)
+ continue;
+ unsigned CRSubReg = CompareUseMI.getOperand(3).getSubReg();
+ unsigned TrueReg = CompareUseMI.getOperand(1).getReg();
+ unsigned FalseReg = CompareUseMI.getOperand(2).getReg();
+ unsigned RegToCopy = selectReg(SExtImm, SExtComparand, Opc, TrueReg,
+ FalseReg, CRSubReg);
+ if (RegToCopy == PPC::NoRegister)
+ continue;
+ // Can't use PPC::COPY to copy PPC::ZERO[8]. Convert it to LI[8] 0.
+ if (RegToCopy == PPC::ZERO || RegToCopy == PPC::ZERO8) {
+ CompareUseMI.setDesc(get(UseOpc == PPC::ISEL8 ? PPC::LI8 : PPC::LI));
+ CompareUseMI.getOperand(1).ChangeToImmediate(0);
+ CompareUseMI.RemoveOperand(3);
+ CompareUseMI.RemoveOperand(2);
+ continue;
+ }
+ DEBUG(dbgs() << "Found LI -> CMPI -> ISEL, replacing with a copy.\n");
+ DEBUG(DefMI->dump(); MI.dump(); CompareUseMI.dump());
+ DEBUG(dbgs() << "Is converted to:\n");
+ // Convert to copy and remove unneeded operands.
+ CompareUseMI.setDesc(get(PPC::COPY));
+ CompareUseMI.RemoveOperand(3);
+ CompareUseMI.RemoveOperand(RegToCopy == TrueReg ? 2 : 1);
+ CmpIselsConverted++;
+ Changed = true;
+ DEBUG(CompareUseMI.dump());
+ }
+ if (Changed)
+ return true;
+ // This may end up incremented multiple times since this function is called
+ // during a fixed-point transformation, but it is only meant to indicate the
+ // presence of this opportunity.
+ MissedConvertibleImmediateInstrs++;
+ return false;
+ }
+
+  // Immediate forms - may simply be convertible to an LI.
+ case PPC::ADDI:
+ case PPC::ADDI8: {
+ // Does the sum fit in a 16-bit signed field?
+ int64_t Addend = MI.getOperand(2).getImm();
+ if (isInt<16>(Addend + SExtImm)) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc == PPC::ADDI8;
+ NewImm = Addend + SExtImm;
+ break;
+ }
+ return false;
+ }
+ case PPC::RLDICL:
+ case PPC::RLDICLo:
+ case PPC::RLDICL_32:
+ case PPC::RLDICL_32_64: {
+ // Use APInt's rotate function.
+ int64_t SH = MI.getOperand(2).getImm();
+ int64_t MB = MI.getOperand(3).getImm();
+ APInt InVal(Opc == PPC::RLDICL ? 64 : 32, SExtImm, true);
+ InVal = InVal.rotl(SH);
+ uint64_t Mask = (1LU << (63 - MB + 1)) - 1;
+ InVal &= Mask;
+ // Can't replace negative values with an LI as that will sign-extend
+ // and not clear the left bits. If we're setting the CR bit, we will use
+ // ANDIo which won't sign extend, so that's safe.
+ if (isUInt<15>(InVal.getSExtValue()) ||
+ (Opc == PPC::RLDICLo && isUInt<16>(InVal.getSExtValue()))) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc != PPC::RLDICL_32;
+ NewImm = InVal.getSExtValue();
+ SetCR = Opc == PPC::RLDICLo;
+ break;
+ }
+ return false;
+ }
+ case PPC::RLWINM:
+ case PPC::RLWINM8:
+ case PPC::RLWINMo:
+ case PPC::RLWINM8o: {
+ int64_t SH = MI.getOperand(2).getImm();
+ int64_t MB = MI.getOperand(3).getImm();
+ int64_t ME = MI.getOperand(4).getImm();
+ APInt InVal(32, SExtImm, true);
+ InVal = InVal.rotl(SH);
+ // Set the bits ( MB + 32 ) to ( ME + 32 ).
+ uint64_t Mask = ((1 << (32 - MB)) - 1) & ~((1 << (31 - ME)) - 1);
+ InVal &= Mask;
+ // Can't replace negative values with an LI as that will sign-extend
+ // and not clear the left bits. If we're setting the CR bit, we will use
+ // ANDIo which won't sign extend, so that's safe.
+ bool ValueFits = isUInt<15>(InVal.getSExtValue());
+ ValueFits |= ((Opc == PPC::RLWINMo || Opc == PPC::RLWINM8o) &&
+ isUInt<16>(InVal.getSExtValue()));
+ if (ValueFits) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o;
+ NewImm = InVal.getSExtValue();
+ SetCR = Opc == PPC::RLWINMo || Opc == PPC::RLWINM8o;
+ break;
+ }
+ return false;
+ }
+ case PPC::ORI:
+ case PPC::ORI8:
+ case PPC::XORI:
+ case PPC::XORI8: {
+ int64_t LogicalImm = MI.getOperand(2).getImm();
+ int64_t Result = 0;
+ if (Opc == PPC::ORI || Opc == PPC::ORI8)
+ Result = LogicalImm | SExtImm;
+ else
+ Result = LogicalImm ^ SExtImm;
+ if (isInt<16>(Result)) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc == PPC::ORI8 || Opc == PPC::XORI8;
+ NewImm = Result;
+ break;
+ }
+ return false;
+ }
+ }
+
+ if (ReplaceWithLI) {
+ DEBUG(dbgs() << "Replacing instruction:\n");
+ DEBUG(MI.dump());
+ DEBUG(dbgs() << "Fed by:\n");
+ DEBUG(DefMI->dump());
+ LoadImmediateInfo LII;
+ LII.Imm = NewImm;
+ LII.Is64Bit = Is64BitLI;
+ LII.SetCR = SetCR;
+ // If we're setting the CR, the original load-immediate must be kept (as an
+ // operand to ANDIo/ANDI8o).
+ if (KilledDef && SetCR)
+ *KilledDef = nullptr;
+ replaceInstrWithLI(MI, LII);
+ DEBUG(dbgs() << "With:\n");
+ DEBUG(MI.dump());
+ return true;
+ }
+ return false;
+}
+
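The SExtImm computation near the top of convertToImmediateForm is a hand-rolled 16-bit sign extension of the LI/LI8 operand. A quick check of that bit trick against the obvious reference, for illustration only:

    #include <cassert>
    #include <cstdint>

    // Same expression as in the patch: if any bit above bit 14 is set, fill
    // the upper 48 bits with ones, otherwise leave the value alone.
    int64_t sextTrick(int64_t Imm) {
      return ((uint64_t)Imm & ~0x7FFFuLL) != 0 ? (Imm | 0xFFFFFFFFFFFF0000)
                                               : Imm;
    }

    int main() {
      for (int64_t V = -32768; V <= 32767; ++V) {
        assert(sextTrick(V & 0xFFFF) == V); // raw 16-bit pattern
        assert(sextTrick(V) == V);          // already sign-extended value
      }
      return 0;
    }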
+bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
+ ImmInstrInfo &III) const {
+ unsigned Opc = MI.getOpcode();
+ // The vast majority of the instructions would need their operand 2 replaced
+ // with an immediate when switching to the reg+imm form. A marked exception
+ // are the update form loads/stores for which a constant operand 2 would need
+ // to turn into a displacement and move operand 1 to the operand 2 position.
+ III.ImmOpNo = 2;
+ III.ConstantOpNo = 2;
+ III.ImmWidth = 16;
+ III.ImmMustBeMultipleOf = 1;
+ switch (Opc) {
+ default: return false;
+ case PPC::ADD4:
+ case PPC::ADD8:
+ III.SignedImm = true;
+ III.ZeroIsSpecialOrig = 0;
+ III.ZeroIsSpecialNew = 1;
+ III.IsCommutative = true;
+ III.ImmOpcode = Opc == PPC::ADD4 ? PPC::ADDI : PPC::ADDI8;
+ break;
+ case PPC::ADDC:
+ case PPC::ADDC8:
+ III.SignedImm = true;
+ III.ZeroIsSpecialOrig = 0;
+ III.ZeroIsSpecialNew = 0;
+ III.IsCommutative = true;
+ III.ImmOpcode = Opc == PPC::ADDC ? PPC::ADDIC : PPC::ADDIC8;
+ break;
+ case PPC::ADDCo:
+ III.SignedImm = true;
+ III.ZeroIsSpecialOrig = 0;
+ III.ZeroIsSpecialNew = 0;
+ III.IsCommutative = true;
+ III.ImmOpcode = PPC::ADDICo;
+ break;
+ case PPC::SUBFC:
+ case PPC::SUBFC8:
+ III.SignedImm = true;
+ III.ZeroIsSpecialOrig = 0;
+ III.ZeroIsSpecialNew = 0;
+ III.IsCommutative = false;
+ III.ImmOpcode = Opc == PPC::SUBFC ? PPC::SUBFIC : PPC::SUBFIC8;
+ break;
+ case PPC::CMPW:
+ case PPC::CMPD:
+ III.SignedImm = true;
+ III.ZeroIsSpecialOrig = 0;
+ III.ZeroIsSpecialNew = 0;
+ III.IsCommutative = false;
+ III.ImmOpcode = Opc == PPC::CMPW ? PPC::CMPWI : PPC::CMPDI;
+ break;
+ case PPC::CMPLW:
+ case PPC::CMPLD:
+ III.SignedImm = false;
+ III.ZeroIsSpecialOrig = 0;
+ III.ZeroIsSpecialNew = 0;
+ III.IsCommutative = false;
+ III.ImmOpcode = Opc == PPC::CMPLW ? PPC::CMPLWI : PPC::CMPLDI;
+ break;
+ case PPC::ANDo:
+ case PPC::AND8o:
+ case PPC::OR:
+ case PPC::OR8:
+ case PPC::XOR:
+ case PPC::XOR8:
+ III.SignedImm = false;
+ III.ZeroIsSpecialOrig = 0;
+ III.ZeroIsSpecialNew = 0;
+ III.IsCommutative = true;
+ switch(Opc) {
+ default: llvm_unreachable("Unknown opcode");
+ case PPC::ANDo: III.ImmOpcode = PPC::ANDIo; break;
+ case PPC::AND8o: III.ImmOpcode = PPC::ANDIo8; break;
+ case PPC::OR: III.ImmOpcode = PPC::ORI; break;
+ case PPC::OR8: III.ImmOpcode = PPC::ORI8; break;
+ case PPC::XOR: III.ImmOpcode = PPC::XORI; break;
+ case PPC::XOR8: III.ImmOpcode = PPC::XORI8; break;
+ }
+ break;
+ case PPC::RLWNM:
+ case PPC::RLWNM8:
+ case PPC::RLWNMo:
+ case PPC::RLWNM8o:
+ case PPC::RLDCL:
+ case PPC::RLDCLo:
+ case PPC::RLDCR:
+ case PPC::RLDCRo:
+ case PPC::SLW:
+ case PPC::SLW8:
+ case PPC::SLWo:
+ case PPC::SLW8o:
+ case PPC::SRW:
+ case PPC::SRW8:
+ case PPC::SRWo:
+ case PPC::SRW8o:
+ case PPC::SRAW:
+ case PPC::SRAWo:
+ case PPC::SLD:
+ case PPC::SLDo:
+ case PPC::SRD:
+ case PPC::SRDo:
+ case PPC::SRAD:
+ case PPC::SRADo:
+ III.SignedImm = false;
+ III.ZeroIsSpecialOrig = 0;
+ III.ZeroIsSpecialNew = 0;
+ III.IsCommutative = false;
+ // This isn't actually true, but the instructions ignore any of the
+ // upper bits, so any immediate loaded with an LI is acceptable.
+ III.ImmWidth = 16;
+ switch(Opc) {
+ default: llvm_unreachable("Unknown opcode");
+ case PPC::RLWNM: III.ImmOpcode = PPC::RLWINM; break;
+ case PPC::RLWNM8: III.ImmOpcode = PPC::RLWINM8; break;
+ case PPC::RLWNMo: III.ImmOpcode = PPC::RLWINMo; break;
+ case PPC::RLWNM8o: III.ImmOpcode = PPC::RLWINM8o; break;
+ case PPC::RLDCL: III.ImmOpcode = PPC::RLDICL; break;
+ case PPC::RLDCLo: III.ImmOpcode = PPC::RLDICLo; break;
+ case PPC::RLDCR: III.ImmOpcode = PPC::RLDICR; break;
+ case PPC::RLDCRo: III.ImmOpcode = PPC::RLDICRo; break;
+ case PPC::SLW: III.ImmOpcode = PPC::RLWINM; break;
+ case PPC::SLW8: III.ImmOpcode = PPC::RLWINM8; break;
+ case PPC::SLWo: III.ImmOpcode = PPC::RLWINMo; break;
+ case PPC::SLW8o: III.ImmOpcode = PPC::RLWINM8o; break;
+ case PPC::SRW: III.ImmOpcode = PPC::RLWINM; break;
+ case PPC::SRW8: III.ImmOpcode = PPC::RLWINM8; break;
+ case PPC::SRWo: III.ImmOpcode = PPC::RLWINMo; break;
+ case PPC::SRW8o: III.ImmOpcode = PPC::RLWINM8o; break;
+ case PPC::SRAW: III.ImmOpcode = PPC::SRAWI; break;
+ case PPC::SRAWo: III.ImmOpcode = PPC::SRAWIo; break;
+ case PPC::SLD: III.ImmOpcode = PPC::RLDICR; break;
+ case PPC::SLDo: III.ImmOpcode = PPC::RLDICRo; break;
+ case PPC::SRD: III.ImmOpcode = PPC::RLDICL; break;
+ case PPC::SRDo: III.ImmOpcode = PPC::RLDICLo; break;
+ case PPC::SRAD: III.ImmOpcode = PPC::SRADI; break;
+ case PPC::SRADo: III.ImmOpcode = PPC::SRADIo; break;
+ }
+ break;
+ // Loads and stores:
+ case PPC::LBZX:
+ case PPC::LBZX8:
+ case PPC::LHZX:
+ case PPC::LHZX8:
+ case PPC::LHAX:
+ case PPC::LHAX8:
+ case PPC::LWZX:
+ case PPC::LWZX8:
+ case PPC::LWAX:
+ case PPC::LDX:
+ case PPC::LFSX:
+ case PPC::LFDX:
+ case PPC::STBX:
+ case PPC::STBX8:
+ case PPC::STHX:
+ case PPC::STHX8:
+ case PPC::STWX:
+ case PPC::STWX8:
+ case PPC::STDX:
+ case PPC::STFSX:
+ case PPC::STFDX:
+ III.SignedImm = true;
+ III.ZeroIsSpecialOrig = 1;
+ III.ZeroIsSpecialNew = 2;
+ III.IsCommutative = true;
+ III.ImmOpNo = 1;
+ III.ConstantOpNo = 2;
+ switch(Opc) {
+ default: llvm_unreachable("Unknown opcode");
+ case PPC::LBZX: III.ImmOpcode = PPC::LBZ; break;
+ case PPC::LBZX8: III.ImmOpcode = PPC::LBZ8; break;
+ case PPC::LHZX: III.ImmOpcode = PPC::LHZ; break;
+ case PPC::LHZX8: III.ImmOpcode = PPC::LHZ8; break;
+ case PPC::LHAX: III.ImmOpcode = PPC::LHA; break;
+ case PPC::LHAX8: III.ImmOpcode = PPC::LHA8; break;
+ case PPC::LWZX: III.ImmOpcode = PPC::LWZ; break;
+ case PPC::LWZX8: III.ImmOpcode = PPC::LWZ8; break;
+ case PPC::LWAX:
+ III.ImmOpcode = PPC::LWA;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::LDX: III.ImmOpcode = PPC::LD; III.ImmMustBeMultipleOf = 4; break;
+ case PPC::LFSX: III.ImmOpcode = PPC::LFS; break;
+ case PPC::LFDX: III.ImmOpcode = PPC::LFD; break;
+ case PPC::STBX: III.ImmOpcode = PPC::STB; break;
+ case PPC::STBX8: III.ImmOpcode = PPC::STB8; break;
+ case PPC::STHX: III.ImmOpcode = PPC::STH; break;
+ case PPC::STHX8: III.ImmOpcode = PPC::STH8; break;
+ case PPC::STWX: III.ImmOpcode = PPC::STW; break;
+ case PPC::STWX8: III.ImmOpcode = PPC::STW8; break;
+ case PPC::STDX:
+ III.ImmOpcode = PPC::STD;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::STFSX: III.ImmOpcode = PPC::STFS; break;
+ case PPC::STFDX: III.ImmOpcode = PPC::STFD; break;
+ }
+ break;
+ case PPC::LBZUX:
+ case PPC::LBZUX8:
+ case PPC::LHZUX:
+ case PPC::LHZUX8:
+ case PPC::LHAUX:
+ case PPC::LHAUX8:
+ case PPC::LWZUX:
+ case PPC::LWZUX8:
+ case PPC::LDUX:
+ case PPC::LFSUX:
+ case PPC::LFDUX:
+ case PPC::STBUX:
+ case PPC::STBUX8:
+ case PPC::STHUX:
+ case PPC::STHUX8:
+ case PPC::STWUX:
+ case PPC::STWUX8:
+ case PPC::STDUX:
+ case PPC::STFSUX:
+ case PPC::STFDUX:
+ III.SignedImm = true;
+ III.ZeroIsSpecialOrig = 2;
+ III.ZeroIsSpecialNew = 3;
+ III.IsCommutative = false;
+ III.ImmOpNo = 2;
+ III.ConstantOpNo = 3;
+ switch(Opc) {
+ default: llvm_unreachable("Unknown opcode");
+ case PPC::LBZUX: III.ImmOpcode = PPC::LBZU; break;
+ case PPC::LBZUX8: III.ImmOpcode = PPC::LBZU8; break;
+ case PPC::LHZUX: III.ImmOpcode = PPC::LHZU; break;
+ case PPC::LHZUX8: III.ImmOpcode = PPC::LHZU8; break;
+ case PPC::LHAUX: III.ImmOpcode = PPC::LHAU; break;
+ case PPC::LHAUX8: III.ImmOpcode = PPC::LHAU8; break;
+ case PPC::LWZUX: III.ImmOpcode = PPC::LWZU; break;
+ case PPC::LWZUX8: III.ImmOpcode = PPC::LWZU8; break;
+ case PPC::LDUX:
+ III.ImmOpcode = PPC::LDU;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::LFSUX: III.ImmOpcode = PPC::LFSU; break;
+ case PPC::LFDUX: III.ImmOpcode = PPC::LFDU; break;
+ case PPC::STBUX: III.ImmOpcode = PPC::STBU; break;
+ case PPC::STBUX8: III.ImmOpcode = PPC::STBU8; break;
+ case PPC::STHUX: III.ImmOpcode = PPC::STHU; break;
+ case PPC::STHUX8: III.ImmOpcode = PPC::STHU8; break;
+ case PPC::STWUX: III.ImmOpcode = PPC::STWU; break;
+ case PPC::STWUX8: III.ImmOpcode = PPC::STWU8; break;
+ case PPC::STDUX:
+ III.ImmOpcode = PPC::STDU;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::STFSUX: III.ImmOpcode = PPC::STFSU; break;
+ case PPC::STFDUX: III.ImmOpcode = PPC::STFDU; break;
+ }
+ break;
+ // Power9 only.
+ case PPC::LXVX:
+ case PPC::LXSSPX:
+ case PPC::LXSDX:
+ case PPC::STXVX:
+ case PPC::STXSSPX:
+ case PPC::STXSDX:
+ if (!Subtarget.hasP9Vector())
+ return false;
+ III.SignedImm = true;
+ III.ZeroIsSpecialOrig = 1;
+ III.ZeroIsSpecialNew = 2;
+ III.IsCommutative = true;
+ III.ImmOpNo = 1;
+ III.ConstantOpNo = 2;
+ switch(Opc) {
+ default: llvm_unreachable("Unknown opcode");
+ case PPC::LXVX:
+ III.ImmOpcode = PPC::LXV;
+ III.ImmMustBeMultipleOf = 16;
+ break;
+ case PPC::LXSSPX:
+ III.ImmOpcode = PPC::LXSSP;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::LXSDX:
+ III.ImmOpcode = PPC::LXSD;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::STXVX:
+ III.ImmOpcode = PPC::STXV;
+ III.ImmMustBeMultipleOf = 16;
+ break;
+ case PPC::STXSSPX:
+ III.ImmOpcode = PPC::STXSSP;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::STXSDX:
+ III.ImmOpcode = PPC::STXSD;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ }
+ break;
+ }
+ return true;
+}
+
+// Utility function for swapping two arbitrary operands of an instruction.
+static void swapMIOperands(MachineInstr &MI, unsigned Op1, unsigned Op2) {
+ assert(Op1 != Op2 && "Cannot swap operand with itself.");
+
+ unsigned MaxOp = std::max(Op1, Op2);
+ unsigned MinOp = std::min(Op1, Op2);
+ MachineOperand MOp1 = MI.getOperand(MinOp);
+ MachineOperand MOp2 = MI.getOperand(MaxOp);
+ MI.RemoveOperand(std::max(Op1, Op2));
+ MI.RemoveOperand(std::min(Op1, Op2));
+
+ // If the operands we are swapping are the two at the end (the common case)
+ // we can just remove both and add them in the opposite order.
+ if (MaxOp - MinOp == 1 && MI.getNumOperands() == MinOp) {
+ MI.addOperand(MOp2);
+ MI.addOperand(MOp1);
+ } else {
+ // Store all operands in a temporary vector, remove them and re-add in the
+ // right order.
+ SmallVector<MachineOperand, 2> MOps;
+ unsigned TotalOps = MI.getNumOperands() + 2; // We've already removed 2 ops.
+ for (unsigned i = MI.getNumOperands() - 1; i >= MinOp; i--) {
+ MOps.push_back(MI.getOperand(i));
+ MI.RemoveOperand(i);
+ }
+ // MOp2 needs to be added next.
+ MI.addOperand(MOp2);
+ // Now add the rest.
+ for (unsigned i = MI.getNumOperands(); i < TotalOps; i++) {
+ if (i == MaxOp)
+ MI.addOperand(MOp1);
+ else {
+ MI.addOperand(MOps.back());
+ MOps.pop_back();
+ }
+ }
+ }
+}
+
+bool PPCInstrInfo::transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
+ unsigned ConstantOpNo,
+ int64_t Imm) const {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ bool PostRA = !MRI.isSSA();
+ // Exit early if we can't convert this.
+ if ((ConstantOpNo != III.ConstantOpNo) && !III.IsCommutative)
+ return false;
+ if (Imm % III.ImmMustBeMultipleOf)
+ return false;
+ if (III.SignedImm) {
+ APInt ActualValue(64, Imm, true);
+ if (!ActualValue.isSignedIntN(III.ImmWidth))
+ return false;
+ } else {
+ uint64_t UnsignedMax = (1 << III.ImmWidth) - 1;
+ if ((uint64_t)Imm > UnsignedMax)
+ return false;
+ }
+
+ // If we're post-RA, the instructions don't agree on whether register zero is
+ // special, we can transform this as long as the register operand that will
+ // end up in the location where zero is special isn't R0.
+ if (PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) {
+ unsigned PosForOrigZero = III.ZeroIsSpecialOrig ? III.ZeroIsSpecialOrig :
+ III.ZeroIsSpecialNew + 1;
+ unsigned OrigZeroReg = MI.getOperand(PosForOrigZero).getReg();
+ unsigned NewZeroReg = MI.getOperand(III.ZeroIsSpecialNew).getReg();
+ // If R0 is in the operand where zero is special for the new instruction,
+ // it is unsafe to transform if the constant operand isn't that operand.
+ if ((NewZeroReg == PPC::R0 || NewZeroReg == PPC::X0) &&
+ ConstantOpNo != III.ZeroIsSpecialNew)
+ return false;
+ if ((OrigZeroReg == PPC::R0 || OrigZeroReg == PPC::X0) &&
+ ConstantOpNo != PosForOrigZero)
+ return false;
+ }
+
+ unsigned Opc = MI.getOpcode();
+ bool SpecialShift32 =
+ Opc == PPC::SLW || Opc == PPC::SLWo || Opc == PPC::SRW || Opc == PPC::SRWo;
+ bool SpecialShift64 =
+ Opc == PPC::SLD || Opc == PPC::SLDo || Opc == PPC::SRD || Opc == PPC::SRDo;
+ bool SetCR = Opc == PPC::SLWo || Opc == PPC::SRWo ||
+ Opc == PPC::SLDo || Opc == PPC::SRDo;
+ bool RightShift =
+ Opc == PPC::SRW || Opc == PPC::SRWo || Opc == PPC::SRD || Opc == PPC::SRDo;
+
+ MI.setDesc(get(III.ImmOpcode));
+ if (ConstantOpNo == III.ConstantOpNo) {
+ // Converting shifts to immediate form is a bit tricky since they may do
+ // one of three things:
+ // 1. If the shift amount is between OpSize and 2*OpSize, the result is zero
+ // 2. If the shift amount is zero, the result is unchanged (save for maybe
+ // setting CR0)
+ // 3. If the shift amount is in [1, OpSize), it's just a shift
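+    // For example, "srw %rA, %rS, %rB" with %rB known to hold 3 becomes
+    // "rlwinm %rA, %rS, 29, 3, 31", while a known shift amount of 35 falls
+    // under case 1 and the instruction becomes a load-immediate of zero.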
+ if (SpecialShift32 || SpecialShift64) {
+ LoadImmediateInfo LII;
+ LII.Imm = 0;
+ LII.SetCR = SetCR;
+ LII.Is64Bit = SpecialShift64;
+ uint64_t ShAmt = Imm & (SpecialShift32 ? 0x1F : 0x3F);
+ if (Imm & (SpecialShift32 ? 0x20 : 0x40))
+ replaceInstrWithLI(MI, LII);
+ // Shifts by zero don't change the value. If we don't need to set CR0,
+ // just convert this to a COPY. Can't do this post-RA since we've already
+ // cleaned up the copies.
+ else if (!SetCR && ShAmt == 0 && !PostRA) {
+ MI.RemoveOperand(2);
+ MI.setDesc(get(PPC::COPY));
+ } else {
+ // The 32 bit and 64 bit instructions are quite different.
+ if (SpecialShift32) {
+ // Left shifts use (N, 0, 31-N), right shifts use (32-N, N, 31).
+ uint64_t SH = RightShift ? 32 - ShAmt : ShAmt;
+ uint64_t MB = RightShift ? ShAmt : 0;
+ uint64_t ME = RightShift ? 31 : 31 - ShAmt;
+ MI.getOperand(III.ConstantOpNo).ChangeToImmediate(SH);
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB)
+ .addImm(ME);
+ } else {
+ // Left shifts use (N, 63-N), right shifts use (64-N, N).
+ uint64_t SH = RightShift ? 64 - ShAmt : ShAmt;
+ uint64_t ME = RightShift ? ShAmt : 63 - ShAmt;
+ MI.getOperand(III.ConstantOpNo).ChangeToImmediate(SH);
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME);
+ }
+ }
+ } else
+ MI.getOperand(ConstantOpNo).ChangeToImmediate(Imm);
+ }
+  // Convert commutative instructions (switch the operands and convert the
+  // desired one to an immediate).
+ else if (III.IsCommutative) {
+ MI.getOperand(ConstantOpNo).ChangeToImmediate(Imm);
+ swapMIOperands(MI, ConstantOpNo, III.ConstantOpNo);
+ } else
+ llvm_unreachable("Should have exited early!");
+
+ // For instructions for which the constant register replaces a different
+ // operand than where the immediate goes, we need to swap them.
+ if (III.ConstantOpNo != III.ImmOpNo)
+ swapMIOperands(MI, III.ConstantOpNo, III.ImmOpNo);
+
+ // If the R0/X0 register is special for the original instruction and not for
+ // the new instruction (or vice versa), we need to fix up the register class.
+ if (!PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) {
+ if (!III.ZeroIsSpecialOrig) {
+ unsigned RegToModify = MI.getOperand(III.ZeroIsSpecialNew).getReg();
+ const TargetRegisterClass *NewRC =
+ MRI.getRegClass(RegToModify)->hasSuperClassEq(&PPC::GPRCRegClass) ?
+ &PPC::GPRC_and_GPRC_NOR0RegClass : &PPC::G8RC_and_G8RC_NOX0RegClass;
+ MRI.setRegClass(RegToModify, NewRC);
+ }
+ }
+ return true;
+}
+
const TargetRegisterClass *
PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const {
if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
@@ -2012,3 +2984,290 @@ PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const {
int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) {
return PPC::getRecordFormOpcode(Opcode);
}
+
+// This function returns true if the machine instruction
+// always outputs a value by sign-extending a 32-bit value,
+// i.e. bits 0 through 31 are the same as bit 32
+// (PowerPC bit numbering, where bit 0 is the most significant bit).
+static bool isSignExtendingOp(const MachineInstr &MI) {
+ int Opcode = MI.getOpcode();
+ if (Opcode == PPC::LI || Opcode == PPC::LI8 ||
+ Opcode == PPC::LIS || Opcode == PPC::LIS8 ||
+ Opcode == PPC::SRAW || Opcode == PPC::SRAWo ||
+ Opcode == PPC::SRAWI || Opcode == PPC::SRAWIo ||
+ Opcode == PPC::LWA || Opcode == PPC::LWAX ||
+ Opcode == PPC::LWA_32 || Opcode == PPC::LWAX_32 ||
+ Opcode == PPC::LHA || Opcode == PPC::LHAX ||
+ Opcode == PPC::LHA8 || Opcode == PPC::LHAX8 ||
+ Opcode == PPC::LBZ || Opcode == PPC::LBZX ||
+ Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 ||
+ Opcode == PPC::LBZU || Opcode == PPC::LBZUX ||
+ Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 ||
+ Opcode == PPC::LHZ || Opcode == PPC::LHZX ||
+ Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 ||
+ Opcode == PPC::LHZU || Opcode == PPC::LHZUX ||
+ Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8 ||
+ Opcode == PPC::EXTSB || Opcode == PPC::EXTSBo ||
+ Opcode == PPC::EXTSH || Opcode == PPC::EXTSHo ||
+ Opcode == PPC::EXTSB8 || Opcode == PPC::EXTSH8 ||
+ Opcode == PPC::EXTSW || Opcode == PPC::EXTSWo ||
+ Opcode == PPC::EXTSH8_32_64 || Opcode == PPC::EXTSW_32_64 ||
+ Opcode == PPC::EXTSB8_32_64)
+ return true;
+
+ if (Opcode == PPC::RLDICL && MI.getOperand(3).getImm() >= 33)
+ return true;
+
+ if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo ||
+ Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo) &&
+ MI.getOperand(3).getImm() > 0 &&
+ MI.getOperand(3).getImm() <= MI.getOperand(4).getImm())
+ return true;
+
+ return false;
+}
+
+// This function returns true if the machine instruction
+// always outputs zeros in the upper 32 bits.
+static bool isZeroExtendingOp(const MachineInstr &MI) {
+ int Opcode = MI.getOpcode();
+ // The 16-bit immediate is sign-extended in li/lis.
+ // If the most significant bit is zero, all higher bits are zero.
+ if (Opcode == PPC::LI || Opcode == PPC::LI8 ||
+ Opcode == PPC::LIS || Opcode == PPC::LIS8) {
+ int64_t Imm = MI.getOperand(1).getImm();
+ if (((uint64_t)Imm & ~0x7FFFuLL) == 0)
+ return true;
+ }
+
+  // We have some variations of rotate-and-mask instructions
+  // that clear the upper 32 bits.
+ if ((Opcode == PPC::RLDICL || Opcode == PPC::RLDICLo ||
+ Opcode == PPC::RLDCL || Opcode == PPC::RLDCLo ||
+ Opcode == PPC::RLDICL_32_64) &&
+ MI.getOperand(3).getImm() >= 32)
+ return true;
+
+ if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDICo) &&
+ MI.getOperand(3).getImm() >= 32 &&
+ MI.getOperand(3).getImm() <= 63 - MI.getOperand(2).getImm())
+ return true;
+
+ if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo ||
+ Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo ||
+ Opcode == PPC::RLWINM8 || Opcode == PPC::RLWNM8) &&
+ MI.getOperand(3).getImm() <= MI.getOperand(4).getImm())
+ return true;
+
+  // There are other instructions that clear the upper 32 bits.
+ if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZWo ||
+ Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZWo ||
+ Opcode == PPC::CNTLZW8 || Opcode == PPC::CNTTZW8 ||
+ Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZDo ||
+ Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZDo ||
+ Opcode == PPC::POPCNTD || Opcode == PPC::POPCNTW ||
+ Opcode == PPC::SLW || Opcode == PPC::SLWo ||
+ Opcode == PPC::SRW || Opcode == PPC::SRWo ||
+ Opcode == PPC::SLW8 || Opcode == PPC::SRW8 ||
+ Opcode == PPC::SLWI || Opcode == PPC::SLWIo ||
+ Opcode == PPC::SRWI || Opcode == PPC::SRWIo ||
+ Opcode == PPC::LWZ || Opcode == PPC::LWZX ||
+ Opcode == PPC::LWZU || Opcode == PPC::LWZUX ||
+ Opcode == PPC::LWBRX || Opcode == PPC::LHBRX ||
+ Opcode == PPC::LHZ || Opcode == PPC::LHZX ||
+ Opcode == PPC::LHZU || Opcode == PPC::LHZUX ||
+ Opcode == PPC::LBZ || Opcode == PPC::LBZX ||
+ Opcode == PPC::LBZU || Opcode == PPC::LBZUX ||
+ Opcode == PPC::LWZ8 || Opcode == PPC::LWZX8 ||
+ Opcode == PPC::LWZU8 || Opcode == PPC::LWZUX8 ||
+ Opcode == PPC::LWBRX8 || Opcode == PPC::LHBRX8 ||
+ Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 ||
+ Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8 ||
+ Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 ||
+ Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 ||
+ Opcode == PPC::ANDIo || Opcode == PPC::ANDISo ||
+ Opcode == PPC::ROTRWI || Opcode == PPC::ROTRWIo ||
+ Opcode == PPC::EXTLWI || Opcode == PPC::EXTLWIo ||
+ Opcode == PPC::MFVSRWZ)
+ return true;
+
+ return false;
+}
+
+// This function returns true if the input MachineInstr is a TOC save
+// instruction.
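+// In practice this matches the "std %x2, <offset>(%x1)" that saves the TOC
+// pointer across calls; the offset comes from the frame lowering (typically
+// 24 under ELFv2 and 40 under ELFv1).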
+bool PPCInstrInfo::isTOCSaveMI(const MachineInstr &MI) const {
+ if (!MI.getOperand(1).isImm() || !MI.getOperand(2).isReg())
+ return false;
+ unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+ unsigned StackOffset = MI.getOperand(1).getImm();
+ unsigned StackReg = MI.getOperand(2).getReg();
+ if (StackReg == PPC::X1 && StackOffset == TOCSaveOffset)
+ return true;
+
+ return false;
+}
+
+// We limit the max depth to track incoming values of PHIs or binary ops
+// (e.g. AND) to avoid excessive cost.
+const unsigned MAX_DEPTH = 1;
+
+bool
+PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
+ const unsigned Depth) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+  // If we know this instruction returns a sign- or zero-extended result,
+  // return true.
+  if (SignExt ? isSignExtendingOp(MI) : isZeroExtendingOp(MI))
+ return true;
+
+ switch (MI.getOpcode()) {
+ case PPC::COPY: {
+ unsigned SrcReg = MI.getOperand(1).getReg();
+
+ // In both ELFv1 and v2 ABI, method parameters and the return value
+ // are sign- or zero-extended.
+ if (MF->getSubtarget<PPCSubtarget>().isSVR4ABI()) {
+ const PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+ // We check the ZExt/SExt flags for a method parameter.
+ if (MI.getParent()->getBasicBlock() ==
+ &MF->getFunction().getEntryBlock()) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ if (MF->getRegInfo().isLiveIn(VReg))
+ return SignExt ? FuncInfo->isLiveInSExt(VReg) :
+ FuncInfo->isLiveInZExt(VReg);
+ }
+
+      // For a method return value, we check the ZExt/SExt flags in the return
+      // attributes. We assume the following code sequence for a method call.
+ // ADJCALLSTACKDOWN 32, implicit dead %r1, implicit %r1
+ // BL8_NOP @func,...
+ // ADJCALLSTACKUP 32, 0, implicit dead %r1, implicit %r1
+ // %5 = COPY %x3; G8RC:%5
+ if (SrcReg == PPC::X3) {
+ const MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::const_instr_iterator II =
+ MachineBasicBlock::const_instr_iterator(&MI);
+ if (II != MBB->instr_begin() &&
+ (--II)->getOpcode() == PPC::ADJCALLSTACKUP) {
+ const MachineInstr &CallMI = *(--II);
+ if (CallMI.isCall() && CallMI.getOperand(0).isGlobal()) {
+ const Function *CalleeFn =
+ dyn_cast<Function>(CallMI.getOperand(0).getGlobal());
+ if (!CalleeFn)
+ return false;
+ const IntegerType *IntTy =
+ dyn_cast<IntegerType>(CalleeFn->getReturnType());
+ const AttributeSet &Attrs =
+ CalleeFn->getAttributes().getRetAttributes();
+ if (IntTy && IntTy->getBitWidth() <= 32)
+ return Attrs.hasAttribute(SignExt ? Attribute::SExt :
+ Attribute::ZExt);
+ }
+ }
+ }
+ }
+
+    // If this is a copy from another register, we recursively check the
+    // source.
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+ const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI != NULL)
+ return isSignOrZeroExtended(*SrcMI, SignExt, Depth);
+
+ return false;
+ }
+
+ case PPC::ANDIo:
+ case PPC::ANDISo:
+ case PPC::ORI:
+ case PPC::ORIS:
+ case PPC::XORI:
+ case PPC::XORIS:
+ case PPC::ANDIo8:
+ case PPC::ANDISo8:
+ case PPC::ORI8:
+ case PPC::ORIS8:
+ case PPC::XORI8:
+ case PPC::XORIS8: {
+    // A logical operation with a 16-bit immediate does not change the upper
+    // bits, so we track the operand register as we do for a register copy.
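+    // For example, "%5 = ORI %4, 7" is sign- or zero-extended iff %4 is.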
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+ const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI != NULL)
+ return isSignOrZeroExtended(*SrcMI, SignExt, Depth);
+
+ return false;
+ }
+
+ // If all incoming values are sign-/zero-extended,
+ // the output of OR, ISEL or PHI is also sign-/zero-extended.
+ case PPC::OR:
+ case PPC::OR8:
+ case PPC::ISEL:
+ case PPC::PHI: {
+ if (Depth >= MAX_DEPTH)
+ return false;
+
+    // The input registers for a PHI are operands 1, 3, ...
+    // The input registers for the others are operands 1 and 2.
+ unsigned E = 3, D = 1;
+ if (MI.getOpcode() == PPC::PHI) {
+ E = MI.getNumOperands();
+ D = 2;
+ }
+
+ for (unsigned I = 1; I != E; I += D) {
+ if (MI.getOperand(I).isReg()) {
+ unsigned SrcReg = MI.getOperand(I).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+ const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI == NULL || !isSignOrZeroExtended(*SrcMI, SignExt, Depth+1))
+ return false;
+ }
+ else
+ return false;
+ }
+ return true;
+ }
+
+  // If at least one of the incoming values of an AND is zero-extended
+  // then the output is also zero-extended. If both of the incoming values
+  // are sign-extended then the output is also sign-extended.
+ case PPC::AND:
+ case PPC::AND8: {
+ if (Depth >= MAX_DEPTH)
+ return false;
+
+ assert(MI.getOperand(1).isReg() && MI.getOperand(2).isReg());
+
+ unsigned SrcReg1 = MI.getOperand(1).getReg();
+ unsigned SrcReg2 = MI.getOperand(2).getReg();
+
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg1) ||
+ !TargetRegisterInfo::isVirtualRegister(SrcReg2))
+ return false;
+
+ const MachineInstr *MISrc1 = MRI->getVRegDef(SrcReg1);
+ const MachineInstr *MISrc2 = MRI->getVRegDef(SrcReg2);
+ if (!MISrc1 || !MISrc2)
+ return false;
+
+ if(SignExt)
+ return isSignOrZeroExtended(*MISrc1, SignExt, Depth+1) &&
+ isSignOrZeroExtended(*MISrc2, SignExt, Depth+1);
+ else
+ return isSignOrZeroExtended(*MISrc1, SignExt, Depth+1) ||
+ isSignOrZeroExtended(*MISrc2, SignExt, Depth+1);
+ }
+
+ default:
+ break;
+ }
+ return false;
+}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index b0629c88cf57b..4271c50127a1d 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -16,7 +16,7 @@
#include "PPC.h"
#include "PPCRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "PPCGenInstrInfo.inc"
@@ -72,6 +72,41 @@ enum {
};
} // end namespace PPCII
+// Instructions that have an immediate form might be convertible to that
+// form if the correct input is a result of a load immediate. In order to
+// know whether the transformation is special, we might need to know some
+// of the details of the two forms.
+struct ImmInstrInfo {
+ // Is the immediate field in the immediate form signed or unsigned?
+ uint64_t SignedImm : 1;
+ // Does the immediate need to be a multiple of some value?
+ uint64_t ImmMustBeMultipleOf : 5;
+ // Is R0/X0 treated specially by the original r+r instruction?
+ // If so, in which operand?
+ uint64_t ZeroIsSpecialOrig : 3;
+ // Is R0/X0 treated specially by the new r+i instruction?
+ // If so, in which operand?
+ uint64_t ZeroIsSpecialNew : 3;
+ // Is the operation commutative?
+ uint64_t IsCommutative : 1;
+ // The operand number to check for load immediate.
+ uint64_t ConstantOpNo : 3;
+ // The operand number for the immediate.
+ uint64_t ImmOpNo : 3;
+ // The opcode of the new instruction.
+ uint64_t ImmOpcode : 16;
+ // The size of the immediate.
+ uint64_t ImmWidth : 5;
+};
+
+// Information required to convert an instruction to just a materialized
+// immediate.
+struct LoadImmediateInfo {
+ unsigned Imm : 16;
+ unsigned Is64Bit : 1;
+ unsigned SetCR : 1;
+};
+
class PPCSubtarget;
class PPCInstrInfo : public PPCGenInstrInfo {
PPCSubtarget &Subtarget;
@@ -87,6 +122,10 @@ class PPCInstrInfo : public PPCGenInstrInfo {
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr *> &NewMIs,
bool &NonRI, bool &SpillsVRS) const;
+ bool transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
+ unsigned ConstantOpNo, int64_t Imm) const;
+ MachineInstr *getConstantDefMI(MachineInstr &MI, unsigned &ConstOp,
+ bool &SeenIntermediateUse) const;
virtual void anchor();
protected:
@@ -282,6 +321,9 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const override;
+  // Expand a VSX memory pseudo instruction to either a VSX or an FP
+  // instruction.
+ bool expandVSXMemPseudo(MachineInstr &MI) const;
+
// Lower pseudo instructions after register allocation.
bool expandPostRAPseudo(MachineInstr &MI) const override;
@@ -293,6 +335,36 @@ public:
}
const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const;
static int getRecordFormOpcode(unsigned Opcode);
+
+ bool isTOCSaveMI(const MachineInstr &MI) const;
+
+ bool isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
+ const unsigned PhiDepth) const;
+
+  /// Return true if the output of the instruction is always sign-extended,
+  /// i.e. bits 0 through 31 are the same as bit 32.
+ bool isSignExtended(const MachineInstr &MI, const unsigned depth = 0) const {
+ return isSignOrZeroExtended(MI, true, depth);
+ }
+
+  /// Return true if the output of the instruction is always zero-extended,
+  /// i.e. bits 0 through 31 are all zeros.
+ bool isZeroExtended(const MachineInstr &MI, const unsigned depth = 0) const {
+ return isSignOrZeroExtended(MI, false, depth);
+ }
+
+ bool convertToImmediateForm(MachineInstr &MI,
+ MachineInstr **KilledDef = nullptr) const;
+ void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const;
+
+  // This is used to find the "true" source register for a
+  // machine instruction. Returns the original SrcReg unless it is the target
+ // of a copy-like operation, in which case we chain backwards through all
+ // such operations to the ultimate source register. If a
+ // physical register is encountered, we stop the search.
+ static unsigned lookThruCopyLike(unsigned SrcReg,
+ const MachineRegisterInfo *MRI);
+ bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III) const;
};
}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index dd7fc2659102a..a932d05b24eef 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -181,7 +181,7 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
-def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>;
+def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>;
def PPCxxreverse : SDNode<"PPCISD::XXREVERSE", SDT_PPCVecReverse, []>;
def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
@@ -1057,6 +1057,20 @@ multiclass XSForm_1rc<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
}
}
+multiclass XSForm_1r<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
+ string asmbase, string asmstr, InstrItinClass itin,
+ list<dag> pattern> {
+ let BaseName = asmbase in {
+ def NAME : XSForm_1<opcode, xo, OOL, IOL,
+ !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+ pattern>, RecFormRel;
+ let Defs = [CR0] in
+ def o : XSForm_1<opcode, xo, OOL, IOL,
+ !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+ []>, isDOT, RecFormRel;
+ }
+}
+
multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
string asmbase, string asmstr, InstrItinClass itin,
list<dag> pattern> {
@@ -1576,6 +1590,11 @@ def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)),
(ICBT 0, xoaddr:$dst)>, Requires<[HasICBT]>; // inst prefetch (for read)
// Atomic operations
+// FIXME: some of these might be used with constant operands. This will result
+// in constant materialization instructions that may be redundant. We currently
+// clean this up in PPCMIPeephole with calls to
+// PPCInstrInfo::convertToImmediateForm() but we should probably not emit them
+// in the first place.
let usesCustomInserter = 1 in {
let Defs = [CR0] in {
def ATOMIC_LOAD_ADD_I8 : Pseudo<
@@ -2571,6 +2590,35 @@ let Uses = [RM] in {
let Defs = [CR1] in
def MFFSo : XForm_42<63, 583, (outs f8rc:$rT), (ins),
"mffs. $rT", IIC_IntMFFS, []>, isDOT;
+
+ def MFFSCE : X_FRT5_XO2_XO3_XO10<63, 0, 1, 583, (outs f8rc:$rT), (ins),
+ "mffsce $rT", IIC_IntMFFS, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+
+ def MFFSCDRN : X_FRT5_XO2_XO3_FRB5_XO10<63, 2, 4, 583, (outs f8rc:$rT),
+ (ins f8rc:$FRB), "mffscdrn $rT, $FRB",
+ IIC_IntMFFS, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+
+ def MFFSCDRNI : X_FRT5_XO2_XO3_DRM3_XO10<63, 2, 5, 583, (outs f8rc:$rT),
+ (ins u3imm:$DRM),
+ "mffscdrni $rT, $DRM",
+ IIC_IntMFFS, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+
+ def MFFSCRN : X_FRT5_XO2_XO3_FRB5_XO10<63, 2, 6, 583, (outs f8rc:$rT),
+ (ins f8rc:$FRB), "mffscrn $rT, $FRB",
+ IIC_IntMFFS, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+
+ def MFFSCRNI : X_FRT5_XO2_XO3_RM2_X10<63, 2, 7, 583, (outs f8rc:$rT),
+ (ins u2imm:$RM), "mffscrni $rT, $RM",
+ IIC_IntMFFS, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+
+ def MFFSL : X_FRT5_XO2_XO3_XO10<63, 3, 0, 583, (outs f8rc:$rT), (ins),
+ "mffsl $rT", IIC_IntMFFS, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
}
let Predicates = [IsISA3_0] in {
@@ -3890,6 +3938,63 @@ def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
"stdcix $RST, $A, $B", IIC_LdStLoad, []>;
+// External PID Load Store Instructions
+
+def LBEPX : XForm_1<31, 95, (outs gprc:$rD), (ins memrr:$src),
+ "lbepx $rD, $src", IIC_LdStLoad, []>,
+ Requires<[IsE500]>;
+
+def LFDEPX : XForm_25<31, 607, (outs f8rc:$frD), (ins memrr:$src),
+ "lfdepx $frD, $src", IIC_LdStLFD, []>,
+ Requires<[IsE500]>;
+
+def LHEPX : XForm_1<31, 287, (outs gprc:$rD), (ins memrr:$src),
+ "lhepx $rD, $src", IIC_LdStLoad, []>,
+ Requires<[IsE500]>;
+
+def LWEPX : XForm_1<31, 31, (outs gprc:$rD), (ins memrr:$src),
+ "lwepx $rD, $src", IIC_LdStLoad, []>,
+ Requires<[IsE500]>;
+
+def STBEPX : XForm_8<31, 223, (outs), (ins gprc:$rS, memrr:$dst),
+ "stbepx $rS, $dst", IIC_LdStStore, []>,
+ Requires<[IsE500]>;
+
+def STFDEPX : XForm_28<31, 735, (outs), (ins f8rc:$frS, memrr:$dst),
+ "stfdepx $frS, $dst", IIC_LdStSTFD, []>,
+ Requires<[IsE500]>;
+
+def STHEPX : XForm_8<31, 415, (outs), (ins gprc:$rS, memrr:$dst),
+ "sthepx $rS, $dst", IIC_LdStStore, []>,
+ Requires<[IsE500]>;
+
+def STWEPX : XForm_8<31, 159, (outs), (ins gprc:$rS, memrr:$dst),
+ "stwepx $rS, $dst", IIC_LdStStore, []>,
+ Requires<[IsE500]>;
+
+def DCBFEP : DCB_Form<127, 0, (outs), (ins memrr:$dst), "dcbfep $dst",
+ IIC_LdStDCBF, []>, Requires<[IsE500]>;
+
+def DCBSTEP : DCB_Form<63, 0, (outs), (ins memrr:$dst), "dcbstep $dst",
+ IIC_LdStDCBF, []>, Requires<[IsE500]>;
+
+def DCBTEP : DCB_Form_hint<319, (outs), (ins memrr:$dst, u5imm:$TH),
+ "dcbtep $TH, $dst", IIC_LdStDCBF, []>,
+ Requires<[IsE500]>;
+
+def DCBTSTEP : DCB_Form_hint<255, (outs), (ins memrr:$dst, u5imm:$TH),
+ "dcbtstep $TH, $dst", IIC_LdStDCBF, []>,
+ Requires<[IsE500]>;
+
+def DCBZEP : DCB_Form<1023, 0, (outs), (ins memrr:$dst), "dcbzep $dst",
+ IIC_LdStDCBF, []>, Requires<[IsE500]>;
+
+def DCBZLEP : DCB_Form<1023, 1, (outs), (ins memrr:$dst), "dcbzlep $dst",
+ IIC_LdStDCBF, []>, Requires<[IsE500]>;
+
+def ICBIEP : XForm_1a<31, 991, (outs), (ins memrr:$src), "icbiep $src",
+ IIC_LdStICBI, []>, Requires<[IsE500]>;
+
//===----------------------------------------------------------------------===//
// PowerPC Assembler Instruction Aliases
//
@@ -3908,6 +4013,7 @@ class PPCAsmPseudo<string asm, dag iops>
let AsmString = asm;
let isAsmParserOnly = 1;
let isPseudo = 1;
+ let hasNoSchedulingInfo = 1;
}
def : InstAlias<"sc", (SC 0)>;
@@ -4208,6 +4314,7 @@ def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n",
(ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n",
(ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
+def SUBPCIS : PPCAsmPseudo<"subpcis $RT, $D", (ins g8rc:$RT, s16imm:$D)>;
def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
@@ -4215,8 +4322,9 @@ def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
def : InstAlias<"clrldi $rA, $rS, $n",
- (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>;
+ (RLDICL_32_64 g8rc:$rA, gprc:$rS, 0, u6imm:$n)>;
def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+def : InstAlias<"lnia $RT", (ADDPCIS g8rc:$RT, 0)>;
def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b",
(ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
@@ -4233,7 +4341,7 @@ def RLWNMobm : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b",
// These generic branch instruction forms are used for the assembler parser only.
// Defs and Uses are conservative, since we don't know the BO value.
-let PPC970_Unit = 7 in {
+let PPC970_Unit = 7, isBranch = 1 in {
let Defs = [CTR], Uses = [CTR, RM] in {
def gBC : BForm_3<16, 0, 0, (outs),
(ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
@@ -4550,7 +4658,7 @@ def : Pat<(i32 (bitreverse i32:$A)),
// n = ((n >> 2) & 0x3333333333333333) | ((n << 2) & 0xCCCCCCCCCCCCCCCC);
// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit):
// n = ((n >> 4) & 0x0F0F0F0F0F0F0F0F) | ((n << 4) & 0xF0F0F0F0F0F0F0F0);
-// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4,B5,B6,B7,B8]):
+// Step 4: byte reverse (Suppose n = [B0,B1,B2,B3,B4,B5,B6,B7]):
// Apply the same byte reverse algorithm mentioned above for the fast 32-bit
// reverse to both the high 32 bit and low 32 bit of the 64 bit value. And
// then OR them together to get the final result.
@@ -4572,92 +4680,55 @@ def DWMaskValues {
dag Hi4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi4, 32, 31), 0xF0F0), 0xF0F0);
}
-def DWShift1 {
- dag Right = (RLDICL $A, 63, 1);
- dag Left = (RLDICR $A, 1, 62);
-}
-
-def DWSwap1 {
- dag Bit = (OR8 (AND8 DWShift1.Right, DWMaskValues.Lo1),
- (AND8 DWShift1.Left, DWMaskValues.Hi1));
-}
-
-def DWShift2 {
- dag Right = (RLDICL DWSwap1.Bit, 62, 2);
- dag Left = (RLDICR DWSwap1.Bit, 2, 61);
-}
-
-def DWSwap2 {
- dag Bits = (OR8 (AND8 DWShift2.Right, DWMaskValues.Lo2),
- (AND8 DWShift2.Left, DWMaskValues.Hi2));
-}
-
-def DWShift4 {
- dag Right = (RLDICL DWSwap2.Bits, 60, 4);
- dag Left = (RLDICR DWSwap2.Bits, 4, 59);
-}
-
-def DWSwap4 {
- dag Bits = (OR8 (AND8 DWShift4.Right, DWMaskValues.Lo4),
- (AND8 DWShift4.Left, DWMaskValues.Hi4));
-}
-
-// Bit swap is done, now start byte swap.
-def DWExtractLo32 {
- dag SubReg = (i32 (EXTRACT_SUBREG DWSwap4.Bits, sub_32));
-}
-
-def DWRotateLo32 {
- dag Left24 = (RLWINM DWExtractLo32.SubReg, 24, 0, 31);
-}
-
-def DWLo32RotateInsertByte3 {
- dag Left = (RLWIMI DWRotateLo32.Left24, DWExtractLo32.SubReg, 8, 8, 15);
-}
-
-// Lower 32 bits in the right order
-def DWLo32RotateInsertByte1 {
- dag Left =
- (RLWIMI DWLo32RotateInsertByte3.Left, DWExtractLo32.SubReg, 8, 24, 31);
+def DWSwapInByte {
+ dag Swap1 = (OR8 (AND8 (RLDICL $A, 63, 1), DWMaskValues.Lo1),
+ (AND8 (RLDICR $A, 1, 62), DWMaskValues.Hi1));
+ dag Swap2 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap1, 62, 2), DWMaskValues.Lo2),
+ (AND8 (RLDICR DWSwapInByte.Swap1, 2, 61), DWMaskValues.Hi2));
+ dag Swap4 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap2, 60, 4), DWMaskValues.Lo4),
+ (AND8 (RLDICR DWSwapInByte.Swap2, 4, 59), DWMaskValues.Hi4));
}
-def ExtendLo32 {
- dag To64Bit =
- (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- DWLo32RotateInsertByte1.Left, sub_32));
+// Intra-byte swap is done, now start inter-byte swap.
+def DWBytes4567 {
+ dag Word = (i32 (EXTRACT_SUBREG DWSwapInByte.Swap4, sub_32));
}
-def DWShiftHi32 { // SRDI DWSwap4.Bits, 32)
- dag ToLo32 = (RLDICL DWSwap4.Bits, 32, 32);
+def DWBytes7456 {
+ dag Word = (RLWINM DWBytes4567.Word, 24, 0, 31);
}
-def DWExtractHi32 {
- dag SubReg = (i32 (EXTRACT_SUBREG DWShiftHi32.ToLo32, sub_32));
+def DWBytes7656 {
+ dag Word = (RLWIMI DWBytes7456.Word, DWBytes4567.Word, 8, 8, 15);
}
-def DWRotateHi32 {
- dag Left24 = (RLWINM DWExtractHi32.SubReg, 24, 0, 31);
+// B7 B6 B5 B4 in the right order
+def DWBytes7654 {
+ dag Word = (RLWIMI DWBytes7656.Word, DWBytes4567.Word, 8, 24, 31);
+ dag DWord =
+ (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes7654.Word, sub_32));
}
-def DWHi32RotateInsertByte3 {
- dag Left = (RLWIMI DWRotateHi32.Left24, DWExtractHi32.SubReg, 8, 8, 15);
+def DWBytes0123 {
+ dag Word = (i32 (EXTRACT_SUBREG (RLDICL DWSwapInByte.Swap4, 32, 32), sub_32));
}
-// High 32 bits in the right order, but in the low 32-bit position
-def DWHi32RotateInsertByte1 {
- dag Left =
- (RLWIMI DWHi32RotateInsertByte3.Left, DWExtractHi32.SubReg, 8, 24, 31);
+def DWBytes3012 {
+ dag Word = (RLWINM DWBytes0123.Word, 24, 0, 31);
}
-def ExtendHi32 {
- dag To64Bit =
- (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- DWHi32RotateInsertByte1.Left, sub_32));
+def DWBytes3212 {
+ dag Word = (RLWIMI DWBytes3012.Word, DWBytes0123.Word, 8, 8, 15);
}
-def DWShiftLo32 { // SLDI ExtendHi32.To64Bit, 32
- dag ToHi32 = (RLDICR ExtendHi32.To64Bit, 32, 31);
+// B3 B2 B1 B0 in the right order
+def DWBytes3210 {
+ dag Word = (RLWIMI DWBytes3212.Word, DWBytes0123.Word, 8, 24, 31);
+ dag DWord =
+ (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes3210.Word, sub_32));
}
+// Now both high word and low word are reversed, next
+// swap the high word and low word.
def : Pat<(i64 (bitreverse i64:$A)),
- (OR8 DWShiftLo32.ToHi32, ExtendLo32.To64Bit)>;
+ (OR8 (RLDICR DWBytes7654.DWord, 32, 31), DWBytes3210.DWord)>;
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 942e8b392b82b..6f719784eb7c6 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -47,6 +47,13 @@ def vssrc : RegisterOperand<VSSRC> {
let ParserMatchClass = PPCRegVSSRCAsmOperand;
}
+def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass {
+ let Name = "RegSPILLTOVSRRC"; let PredicateMethod = "isVSRegNumber";
+}
+
+def spilltovsrrc : RegisterOperand<SPILLTOVSRRC> {
+ let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand;
+}
// Little-endian-specific nodes.
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
@@ -124,6 +131,12 @@ let Uses = [RM] in {
"lxsdx $XT, $src", IIC_LdStLFD,
[(set f64:$XT, (load xoaddr:$src))]>;
+ // Pseudo instruction XFLOADf64 will be expanded to LXSDX or LFDX later
+ let isPseudo = 1, CodeSize = 3 in
+ def XFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrr:$src),
+ "#XFLOADf64",
+ [(set f64:$XT, (load xoaddr:$src))]>;
+
let Predicates = [HasVSX, HasOnlySwappingMemOps] in
def LXVD2X : XX1Form<31, 844,
(outs vsrc:$XT), (ins memrr:$src),
@@ -149,6 +162,12 @@ let Uses = [RM] in {
"stxsdx $XT, $dst", IIC_LdStSTFD,
[(store f64:$XT, xoaddr:$dst)]>;
+ // Pseudo instruction XFSTOREf64 will be expanded to STXSDX or STFDX later
+ let isPseudo = 1, CodeSize = 3 in
+ def XFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrr:$dst),
+ "#XFSTOREf64",
+ [(store f64:$XT, xoaddr:$dst)]>;
+
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
// The behaviour of this instruction is endianness-specific so we provide no
// pattern to match it without considering endianness.
@@ -1208,32 +1227,59 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
- "lxsspx $XT, $src", IIC_LdStLFD,
- [(set f32:$XT, (load xoaddr:$src))]>;
+ "lxsspx $XT, $src", IIC_LdStLFD, []>;
def LXSIWAX : XX1Form<31, 76, (outs vsfrc:$XT), (ins memrr:$src),
- "lxsiwax $XT, $src", IIC_LdStLFD,
- [(set f64:$XT, (PPClfiwax xoaddr:$src))]>;
+ "lxsiwax $XT, $src", IIC_LdStLFD, []>;
def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
- "lxsiwzx $XT, $src", IIC_LdStLFD,
- [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
+ "lxsiwzx $XT, $src", IIC_LdStLFD, []>;
+
+  // Please note that "let isPseudo = 1" is not part of class Pseudo<>.
+  // Omitting it would prevent these pseudos from being expanded in
+  // expandPostRAPseudos().
+ let isPseudo = 1 in {
+ // Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later
+ let CodeSize = 3 in
+ def XFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrr:$src),
+ "#XFLOADf32",
+ [(set f32:$XT, (load xoaddr:$src))]>;
+ // Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later
+ def LIWAX : Pseudo<(outs vsfrc:$XT), (ins memrr:$src),
+ "#LIWAX",
+ [(set f64:$XT, (PPClfiwax xoaddr:$src))]>;
+ // Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later
+ def LIWZX : Pseudo<(outs vsfrc:$XT), (ins memrr:$src),
+ "#LIWZX",
+ [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
+ }
} // mayLoad
// VSX scalar stores introduced in ISA 2.07
let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
- "stxsspx $XT, $dst", IIC_LdStSTFD,
- [(store f32:$XT, xoaddr:$dst)]>;
+ "stxsspx $XT, $dst", IIC_LdStSTFD, []>;
def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
- "stxsiwx $XT, $dst", IIC_LdStSTFD,
- [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
+ "stxsiwx $XT, $dst", IIC_LdStSTFD, []>;
+
+  // Please note that "let isPseudo = 1" is not part of class Pseudo<>.
+  // Omitting it would prevent these pseudos from being expanded in
+  // expandPostRAPseudos().
+ let isPseudo = 1 in {
+ // Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later
+ let CodeSize = 3 in
+ def XFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrr:$dst),
+ "#XFSTOREf32",
+ [(store f32:$XT, xoaddr:$dst)]>;
+ // Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later
+ def STIWX : Pseudo<(outs), (ins vsfrc:$XT, memrr:$dst),
+ "#STIWX",
+ [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
+ }
} // mayStore
} // UseVSXReg = 1
def : Pat<(f64 (extloadf32 xoaddr:$src)),
- (COPY_TO_REGCLASS (LXSSPX xoaddr:$src), VSFRC)>;
+ (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>;
def : Pat<(f32 (fpround (extloadf32 xoaddr:$src))),
- (f32 (LXSSPX xoaddr:$src))>;
+ (f32 (XFLOADf32 xoaddr:$src))>;
def : Pat<(f64 (fpextend f32:$src)),
(COPY_TO_REGCLASS $src, VSFRC)>;
@@ -1407,7 +1453,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
}
def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)),
- (v4i32 (XXSPLTWs (LXSIWAX xoaddr:$src), 1))>;
+ (v4i32 (XXSPLTWs (LIWAX xoaddr:$src), 1))>;
} // AddedComplexity = 400
} // HasP8Vector
@@ -1769,6 +1815,7 @@ def VectorExtractions {
dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC);
}
+def NoP9Altivec : Predicate<"!PPCSubTarget->hasP9Altivec()">;
let AddedComplexity = 400 in {
// v4f32 scalar <-> vector conversions (BE)
let Predicates = [IsBigEndian, HasP8Vector] in {
@@ -1801,6 +1848,17 @@ let Predicates = [IsBigEndian, HasDirectMove] in {
(v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
def : Pat<(v2i64 (scalar_to_vector i64:$A)),
(v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
+
+ // v2i64 scalar <-> vector conversions (BE)
+ def : Pat<(i64 (vector_extract v2i64:$S, 0)),
+ (i64 VectorExtractions.LE_DWORD_1)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, 1)),
+ (i64 VectorExtractions.LE_DWORD_0)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
+ (i64 VectorExtractions.BE_VARIABLE_DWORD)>;
+} // IsBigEndian, HasDirectMove
+
+let Predicates = [IsBigEndian, HasDirectMove, NoP9Altivec] in {
def : Pat<(i32 (vector_extract v16i8:$S, 0)),
(i32 VectorExtractions.LE_BYTE_15)>;
def : Pat<(i32 (vector_extract v16i8:$S, 1)),
@@ -1867,15 +1925,7 @@ let Predicates = [IsBigEndian, HasDirectMove] in {
(i32 VectorExtractions.LE_WORD_0)>;
def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
(i32 VectorExtractions.BE_VARIABLE_WORD)>;
-
- // v2i64 scalar <-> vector conversions (BE)
- def : Pat<(i64 (vector_extract v2i64:$S, 0)),
- (i64 VectorExtractions.LE_DWORD_1)>;
- def : Pat<(i64 (vector_extract v2i64:$S, 1)),
- (i64 VectorExtractions.LE_DWORD_0)>;
- def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
- (i64 VectorExtractions.BE_VARIABLE_DWORD)>;
-} // IsBigEndian, HasDirectMove
+} // IsBigEndian, HasDirectMove, NoP9Altivec
// v4f32 scalar <-> vector conversions (LE)
let Predicates = [IsLittleEndian, HasP8Vector] in {
@@ -1931,8 +1981,10 @@ let Predicates = [HasP9Altivec, IsLittleEndian] in {
(VEXTUWRX (LI8 0), $S)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
(VEXTUWRX (LI8 4), $S)>;
+ // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
- (VEXTUWRX (LI8 8), $S)>;
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (i32 VectorExtractions.LE_WORD_2), sub_32)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
(VEXTUWRX (LI8 12), $S)>;
@@ -1942,11 +1994,82 @@ let Predicates = [HasP9Altivec, IsLittleEndian] in {
(EXTSW (VEXTUWRX (LI8 0), $S))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
(EXTSW (VEXTUWRX (LI8 4), $S))>;
+ // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
- (EXTSW (VEXTUWRX (LI8 8), $S))>;
+ (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (i32 VectorExtractions.LE_WORD_2), sub_32))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
(EXTSW (VEXTUWRX (LI8 12), $S))>;
+
+ def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX $Idx, $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 0), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 1), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 2), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 3), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 4), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 5), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 6), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 7), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 8), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 9), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 10), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 11), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 12), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 13), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 14), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 15), $S), sub_32))>;
+
+ def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX
+ (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 0), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 2), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 4), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 6), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 8), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 10), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 12), $S), sub_32))>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 14), $S), sub_32))>;
+
+ def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUWRX
+ (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 0), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+ (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 4), $S), sub_32))>;
+ // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
+ def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+ (i32 VectorExtractions.LE_WORD_2)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 12), $S), sub_32))>;
}
+
let Predicates = [HasP9Altivec, IsBigEndian] in {
def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
(VEXTUBLX $Idx, $S)>;
@@ -1974,8 +2097,11 @@ let Predicates = [HasP9Altivec, IsBigEndian] in {
(VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
(VEXTUWLX (LI8 0), $S)>;
+
+ // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
- (VEXTUWLX (LI8 4), $S)>;
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (i32 VectorExtractions.LE_WORD_2), sub_32)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
(VEXTUWLX (LI8 8), $S)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
@@ -1985,12 +2111,82 @@ let Predicates = [HasP9Altivec, IsBigEndian] in {
(EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
(EXTSW (VEXTUWLX (LI8 0), $S))>;
+ // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
- (EXTSW (VEXTUWLX (LI8 4), $S))>;
+ (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (i32 VectorExtractions.LE_WORD_2), sub_32))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
(EXTSW (VEXTUWLX (LI8 8), $S))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
(EXTSW (VEXTUWLX (LI8 12), $S))>;
+
+ def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX $Idx, $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 0), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 1), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 2), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 3), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 4), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 5), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 6), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 7), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 8), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 9), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 10), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 11), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 12), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 13), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 14), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 15), $S), sub_32))>;
+
+ def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX
+ (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 0), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 2), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 4), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 6), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 8), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 10), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 12), $S), sub_32))>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 14), $S), sub_32))>;
+
+ def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUWLX
+ (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 0), $S), sub_32))>;
+ // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
+ def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+ (i32 VectorExtractions.LE_WORD_2)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+ (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 8), $S), sub_32))>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 12), $S), sub_32))>;
}
let Predicates = [IsLittleEndian, HasDirectMove] in {
@@ -2003,6 +2199,16 @@ let Predicates = [IsLittleEndian, HasDirectMove] in {
(v4i32 MovesToVSR.LE_WORD_0)>;
def : Pat<(v2i64 (scalar_to_vector i64:$A)),
(v2i64 MovesToVSR.LE_DWORD_0)>;
+ // v2i64 scalar <-> vector conversions (LE)
+ def : Pat<(i64 (vector_extract v2i64:$S, 0)),
+ (i64 VectorExtractions.LE_DWORD_0)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, 1)),
+ (i64 VectorExtractions.LE_DWORD_1)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
+ (i64 VectorExtractions.LE_VARIABLE_DWORD)>;
+} // IsLittleEndian, HasDirectMove
+
+let Predicates = [IsLittleEndian, HasDirectMove, NoP9Altivec] in {
def : Pat<(i32 (vector_extract v16i8:$S, 0)),
(i32 VectorExtractions.LE_BYTE_0)>;
def : Pat<(i32 (vector_extract v16i8:$S, 1)),
@@ -2069,15 +2275,7 @@ let Predicates = [IsLittleEndian, HasDirectMove] in {
(i32 VectorExtractions.LE_WORD_3)>;
def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
(i32 VectorExtractions.LE_VARIABLE_WORD)>;
-
- // v2i64 scalar <-> vector conversions (LE)
- def : Pat<(i64 (vector_extract v2i64:$S, 0)),
- (i64 VectorExtractions.LE_DWORD_0)>;
- def : Pat<(i64 (vector_extract v2i64:$S, 1)),
- (i64 VectorExtractions.LE_DWORD_1)>;
- def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
- (i64 VectorExtractions.LE_VARIABLE_DWORD)>;
-} // IsLittleEndian, HasDirectMove
+} // IsLittleEndian, HasDirectMove, NoP9Altivec
let Predicates = [HasDirectMove, HasVSX] in {
// bitconvert f32 -> i32
@@ -2344,7 +2542,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT),
(ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM),
"xxinsertw $XT, $XB, $UIM", IIC_VecFP,
- [(set v4i32:$XT, (PPCxxinsert v4i32:$XTi, v4i32:$XB,
+ [(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB,
imm32SExt16:$UIM))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
@@ -2550,6 +2748,51 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
UseVSXReg;
} // mayStore
+ let Predicates = [IsLittleEndian] in {
+ def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
+ def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
+ def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
+ def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
+ def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
+ def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
+ def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
+ def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
+ }
+
+ let Predicates = [IsBigEndian] in {
+ def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
+ def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
+ def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
+ def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
+ def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
+ def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
+ def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
+ def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
+ }
+
+ // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead
+ // of f64
+ def : Pat<(v8i16 (PPCmtvsrz i32:$A)),
+ (v8i16 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
+ def : Pat<(v16i8 (PPCmtvsrz i32:$A)),
+ (v16i8 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
+
// Patterns for which instructions from ISA 3.0 are a better match
let Predicates = [IsLittleEndian, HasP9Vector] in {
def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
@@ -2560,6 +2803,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
+ def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
+ def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
+ def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
+ def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
@@ -2587,6 +2838,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
+ def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
+ def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
+ def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
+ def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
@@ -2809,6 +3068,23 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(f32 (DFLOADf32 ixaddr:$src))>;
} // end HasP9Vector, AddedComplexity
+let Predicates = [HasP9Vector] in {
+ let isPseudo = 1 in {
+ let mayStore = 1 in {
+ def SPILLTOVSR_STX : Pseudo<(outs), (ins spilltovsrrc:$XT, memrr:$dst),
+ "#SPILLTOVSR_STX", []>;
+ def SPILLTOVSR_ST : Pseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst),
+ "#SPILLTOVSR_ST", []>;
+ }
+ let mayLoad = 1 in {
+ def SPILLTOVSR_LDX : Pseudo<(outs spilltovsrrc:$XT), (ins memrr:$src),
+ "#SPILLTOVSR_LDX", []>;
+ def SPILLTOVSR_LD : Pseudo<(outs spilltovsrrc:$XT), (ins memrix:$src),
+ "#SPILLTOVSR_LD", []>;
+
+ }
+ }
+}
// Integer extend helper dags 32 -> 64
def AnyExts {
dag A = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32);
@@ -2962,10 +3238,10 @@ let AddedComplexity = 400 in {
(COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>;
def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPSXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>;
+ (XSCVDPSXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>;
def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPUXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>;
+ (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>;
def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
(v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
@@ -2983,19 +3259,19 @@ let AddedComplexity = 400 in {
}
let Predicates = [HasVSX, NoP9Vector] in {
- // Load-and-splat with fp-to-int conversion (using X-Form VSX loads).
+ // Load-and-splat with fp-to-int conversion (using X-Form VSX/FP loads).
def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPSXWS (LXSDX xoaddr:$A)), VSRC), 1))>;
+ (XSCVDPSXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>;
def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPUXWS (LXSDX xoaddr:$A)), VSRC), 1))>;
+ (XSCVDPUXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>;
def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)),
(v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
- (LXSSPX xoaddr:$A), VSFRC)), 0))>;
+ (XFLOADf32 xoaddr:$A), VSFRC)), 0))>;
def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)),
(v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
- (LXSSPX xoaddr:$A), VSFRC)), 0))>;
+ (XFLOADf32 xoaddr:$A), VSFRC)), 0))>;
}
// Big endian, available on all targets with VSX
diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
index a349fa1b40907..cdf544bdfac35 100644
--- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
+++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -61,6 +62,8 @@ static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars",
cl::Hidden, cl::init(16),
cl::desc("Potential PHI threshold for PPC preinc loop prep"));
+STATISTIC(PHINodeAlreadyExists, "PHI node already in pre-increment form");
+
namespace llvm {
void initializePPCLoopPreIncPrepPass(PassRegistry&);
@@ -88,6 +91,9 @@ namespace {
AU.addRequired<ScalarEvolutionWrapperPass>();
}
+ bool alreadyPrepared(Loop *L, Instruction* MemI,
+ const SCEV *BasePtrStartSCEV,
+ const SCEVConstant *BasePtrIncSCEV);
bool runOnFunction(Function &F) override;
bool runOnLoop(Loop *L);
@@ -177,6 +183,62 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) {
return MadeChange;
}
+// In order to prepare for the pre-increment, a PHI is added.
+// This function checks whether such a PHI already exists and returns true if
+// it finds an existing PHI with the same start and increment as the one we
+// wanted to create.
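+// For example, if the loop header already contains a PHI such as
+//   %ptr = phi i8* [ %start, %preheader ], [ %ptr.inc, %latch ]
+// whose SCEV start and step match BasePtrStartSCEV and BasePtrIncSCEV, there
+// is no need to create another one (the names above are only illustrative).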
+bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI,
+ const SCEV *BasePtrStartSCEV,
+ const SCEVConstant *BasePtrIncSCEV) {
+ BasicBlock *BB = MemI->getParent();
+ if (!BB)
+ return false;
+
+ BasicBlock *PredBB = L->getLoopPredecessor();
+ BasicBlock *LatchBB = L->getLoopLatch();
+
+ if (!PredBB || !LatchBB)
+ return false;
+
+  // Run through the PHIs and see if any of them looks like a preparation.
+ iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis();
+ for (auto & CurrentPHI : PHIIter) {
+ PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI);
+ if (!CurrentPHINode)
+ continue;
+
+ if (!SE->isSCEVable(CurrentPHINode->getType()))
+ continue;
+
+ const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L);
+
+ const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV);
+ if (!PHIBasePtrSCEV)
+ continue;
+
+ const SCEVConstant *PHIBasePtrIncSCEV =
+ dyn_cast<SCEVConstant>(PHIBasePtrSCEV->getStepRecurrence(*SE));
+ if (!PHIBasePtrIncSCEV)
+ continue;
+
+ if (CurrentPHINode->getNumIncomingValues() == 2) {
+ if ( (CurrentPHINode->getIncomingBlock(0) == LatchBB &&
+ CurrentPHINode->getIncomingBlock(1) == PredBB) ||
+ (CurrentPHINode->getIncomingBlock(1) == LatchBB &&
+ CurrentPHINode->getIncomingBlock(0) == PredBB) ) {
+ if (PHIBasePtrSCEV->getStart() == BasePtrStartSCEV &&
+ PHIBasePtrIncSCEV == BasePtrIncSCEV) {
+ // The existing PHI (CurrentPHINode) has the same start and increment
+ // as the PHI that we wanted to create.
+ ++PHINodeAlreadyExists;
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
bool MadeChange = false;
@@ -347,6 +409,9 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
+ if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV))
+ continue;
+
PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount,
MemI->hasName() ? MemI->getName() + ".phi" : "",
Header->getFirstNonPHI());
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index b310493587ae7..1e40711328ece 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -20,14 +20,14 @@
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
@@ -143,45 +143,48 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
OutMI.setOpcode(MI->getOpcode());
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
-
MCOperand MCOp;
- switch (MO.getType()) {
- default:
- MI->print(errs());
- llvm_unreachable("unknown operand type");
- case MachineOperand::MO_Register:
- assert(!MO.getSubReg() && "Subregs should be eliminated!");
- assert(MO.getReg() > PPC::NoRegister &&
- MO.getReg() < PPC::NUM_TARGET_REGS &&
- "Invalid register for this target!");
- MCOp = MCOperand::createReg(MO.getReg());
- break;
- case MachineOperand::MO_Immediate:
- MCOp = MCOperand::createImm(MO.getImm());
- break;
- case MachineOperand::MO_MachineBasicBlock:
- MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
- MO.getMBB()->getSymbol(), AP.OutContext));
- break;
- case MachineOperand::MO_GlobalAddress:
- case MachineOperand::MO_ExternalSymbol:
- MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin);
- break;
- case MachineOperand::MO_JumpTableIndex:
- MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin);
- break;
- case MachineOperand::MO_ConstantPoolIndex:
- MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin);
- break;
- case MachineOperand::MO_BlockAddress:
- MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP,
- isDarwin);
- break;
- case MachineOperand::MO_RegisterMask:
- continue;
- }
-
- OutMI.addOperand(MCOp);
+ if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP,
+ isDarwin))
+ OutMI.addOperand(MCOp);
+ }
+}
+
+bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
+ MCOperand &OutMO, AsmPrinter &AP,
+ bool isDarwin) {
+ switch (MO.getType()) {
+ default:
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ assert(!MO.getSubReg() && "Subregs should be eliminated!");
+ assert(MO.getReg() > PPC::NoRegister &&
+ MO.getReg() < PPC::NUM_TARGET_REGS &&
+ "Invalid register for this target!");
+ OutMO = MCOperand::createReg(MO.getReg());
+ return true;
+ case MachineOperand::MO_Immediate:
+ OutMO = MCOperand::createImm(MO.getImm());
+ return true;
+ case MachineOperand::MO_MachineBasicBlock:
+ OutMO = MCOperand::createExpr(
+ MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), AP.OutContext));
+ return true;
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin);
+ return true;
+ case MachineOperand::MO_JumpTableIndex:
+ OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin);
+ return true;
+ case MachineOperand::MO_ConstantPoolIndex:
+ OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin);
+ return true;
+ case MachineOperand::MO_BlockAddress:
+ OutMO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP,
+ isDarwin);
+ return true;
+ case MachineOperand::MO_RegisterMask:
+ return false;
}
}
diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp
index ff5f17c7628f2..a2640727f8138 100644
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -23,18 +23,50 @@
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
#include "PPCTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
+#include "MCTargetDesc/PPCPredicates.h"
using namespace llvm;
#define DEBUG_TYPE "ppc-mi-peepholes"
-namespace llvm {
- void initializePPCMIPeepholePass(PassRegistry&);
-}
+STATISTIC(RemoveTOCSave, "Number of TOC saves removed");
+STATISTIC(MultiTOCSaves,
+ "Number of functions with multiple TOC saves that must be kept");
+STATISTIC(NumEliminatedSExt, "Number of eliminated sign-extensions");
+STATISTIC(NumEliminatedZExt, "Number of eliminated zero-extensions");
+STATISTIC(NumOptADDLIs, "Number of optimized ADD instruction fed by LI");
+STATISTIC(NumConvertedToImmediateForm,
+ "Number of instructions converted to their immediate form");
+STATISTIC(NumFunctionsEnteredInMIPeephole,
+ "Number of functions entered in PPC MI Peepholes");
+STATISTIC(NumFixedPointIterations,
+ "Number of fixed-point iterations converting reg-reg instructions "
+ "to reg-imm ones");
+
+static cl::opt<bool>
+FixedPointRegToImm("ppc-reg-to-imm-fixed-point", cl::Hidden, cl::init(true),
+ cl::desc("Iterate to a fixed point when attempting to "
+ "convert reg-reg instructions to reg-imm"));
+
+static cl::opt<bool>
+ConvertRegReg("ppc-convert-rr-to-ri", cl::Hidden, cl::init(false),
+ cl::desc("Convert eligible reg+reg instructions to reg+imm"));
+
+static cl::opt<bool>
+ EnableSExtElimination("ppc-eliminate-signext",
+ cl::desc("enable elimination of sign-extensions"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+ EnableZExtElimination("ppc-eliminate-zeroext",
+ cl::desc("enable elimination of zero-extensions"),
+ cl::init(false), cl::Hidden);
namespace {
@@ -50,20 +82,31 @@ struct PPCMIPeephole : public MachineFunctionPass {
}
private:
+ MachineDominatorTree *MDT;
+
// Initialize class variables.
void initialize(MachineFunction &MFParm);
// Perform peepholes.
bool simplifyCode(void);
- // Find the "true" register represented by SrcReg (following chains
- // of copies and subreg_to_reg operations).
- unsigned lookThruCopyLike(unsigned SrcReg);
+  // Additional peepholes: eliminate redundant compares and TOC saves.
+ bool eliminateRedundantCompare(void);
+ bool eliminateRedundantTOCSaves(std::map<MachineInstr *, bool> &TOCSaves);
+ void UpdateTOCSaves(std::map<MachineInstr *, bool> &TOCSaves,
+ MachineInstr *MI);
public:
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
// Main entry point for this pass.
bool runOnMachineFunction(MachineFunction &MF) override {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
initialize(MF);
return simplifyCode();
@@ -74,15 +117,138 @@ public:
void PPCMIPeephole::initialize(MachineFunction &MFParm) {
MF = &MFParm;
MRI = &MF->getRegInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n");
DEBUG(MF->dump());
}
+static MachineInstr *getVRegDefOrNull(MachineOperand *Op,
+ MachineRegisterInfo *MRI) {
+ assert(Op && "Invalid Operand!");
+ if (!Op->isReg())
+ return nullptr;
+
+ unsigned Reg = Op->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return nullptr;
+
+ return MRI->getVRegDef(Reg);
+}
+
+// This function returns the number of known zero bits in the output of MI,
+// counting from the most significant bit.
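+// For example, ANDIo masks with a 16-bit immediate, so for an immediate of
+// 0x00FF the result is known to have 48 + countLeadingZeros(0x00FF) == 56
+// leading zero bits.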
+static unsigned
+getKnownLeadingZeroCount(MachineInstr *MI, const PPCInstrInfo *TII) {
+ unsigned Opcode = MI->getOpcode();
+ if (Opcode == PPC::RLDICL || Opcode == PPC::RLDICLo ||
+ Opcode == PPC::RLDCL || Opcode == PPC::RLDCLo)
+ return MI->getOperand(3).getImm();
+
+ if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDICo) &&
+ MI->getOperand(3).getImm() <= 63 - MI->getOperand(2).getImm())
+ return MI->getOperand(3).getImm();
+
+ if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo ||
+ Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo ||
+ Opcode == PPC::RLWINM8 || Opcode == PPC::RLWNM8) &&
+ MI->getOperand(3).getImm() <= MI->getOperand(4).getImm())
+ return 32 + MI->getOperand(3).getImm();
+
+ if (Opcode == PPC::ANDIo) {
+ uint16_t Imm = MI->getOperand(2).getImm();
+ return 48 + countLeadingZeros(Imm);
+ }
+
+ if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZWo ||
+ Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZWo ||
+ Opcode == PPC::CNTLZW8 || Opcode == PPC::CNTTZW8)
+ // The result ranges from 0 to 32.
+ return 58;
+
+ if (Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZDo ||
+ Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZDo)
+ // The result ranges from 0 to 64.
+ return 57;
+
+ if (Opcode == PPC::LHZ || Opcode == PPC::LHZX ||
+ Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 ||
+ Opcode == PPC::LHZU || Opcode == PPC::LHZUX ||
+ Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8)
+ return 48;
+
+ if (Opcode == PPC::LBZ || Opcode == PPC::LBZX ||
+ Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 ||
+ Opcode == PPC::LBZU || Opcode == PPC::LBZUX ||
+ Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8)
+ return 56;
+
+ if (TII->isZeroExtended(*MI))
+ return 32;
+
+ return 0;
+}
+
+// This function maintains a map of the pairs <TOC Save Instr, Keep>.
+// Each time a new TOC save is encountered, it checks if any of the existing
+// ones is dominated by the new one. If so, it marks the existing one as
+// redundant by setting its entry in the map to false. It then adds the new
+// instruction to the map with either true or false, depending on whether any
+// existing instruction dominates the new one.
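+// For example, a TOC save that dominates every other TOC save in the function
+// is kept, and the dominated ones are later erased by
+// eliminateRedundantTOCSaves().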
+void PPCMIPeephole::UpdateTOCSaves(
+ std::map<MachineInstr *, bool> &TOCSaves, MachineInstr *MI) {
+ assert(TII->isTOCSaveMI(*MI) && "Expecting a TOC save instruction here");
+ bool Keep = true;
+ for (auto It = TOCSaves.begin(); It != TOCSaves.end(); It++ ) {
+ MachineInstr *CurrInst = It->first;
+    // If the new instruction dominates an existing one, mark the existing one
+    // as redundant.
+ if (It->second && MDT->dominates(MI, CurrInst))
+ It->second = false;
+ // Check if the new instruction is redundant.
+ if (MDT->dominates(CurrInst, MI)) {
+ Keep = false;
+ break;
+ }
+ }
+ // Add new instruction to map.
+ TOCSaves[MI] = Keep;
+}
+
// Perform peephole optimizations.
bool PPCMIPeephole::simplifyCode(void) {
bool Simplified = false;
MachineInstr* ToErase = nullptr;
+ std::map<MachineInstr *, bool> TOCSaves;
+
+ NumFunctionsEnteredInMIPeephole++;
+ if (ConvertRegReg) {
+ // Fixed-point conversion of reg/reg instructions fed by load-immediate
+ // into reg/imm instructions. FIXME: This is expensive, control it with
+ // an option.
+ bool SomethingChanged = false;
+ do {
+ NumFixedPointIterations++;
+ SomethingChanged = false;
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue())
+ continue;
+
+ if (TII->convertToImmediateForm(MI)) {
+ // We don't erase anything in case the def has other uses. Let DCE
+ // remove it if it can be removed.
+ DEBUG(dbgs() << "Converted instruction to imm form: ");
+ DEBUG(MI.dump());
+ NumConvertedToImmediateForm++;
+ SomethingChanged = true;
+ Simplified = true;
+ continue;
+ }
+ }
+ }
+ } while (SomethingChanged && FixedPointRegToImm);
+ }
for (MachineBasicBlock &MBB : *MF) {
for (MachineInstr &MI : MBB) {
@@ -104,6 +270,18 @@ bool PPCMIPeephole::simplifyCode(void) {
default:
break;
+ case PPC::STD: {
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ if (MFI.hasVarSizedObjects() ||
+ !MF->getSubtarget<PPCSubtarget>().isELFv2ABI())
+ break;
+      // When encountering a TOC save instruction, call UpdateTOCSaves
+      // to add it to the TOCSaves map and mark any existing TOC saves
+      // it dominates as redundant.
+ if (TII->isTOCSaveMI(MI))
+ UpdateTOCSaves(TOCSaves, &MI);
+ break;
+ }
case PPC::XXPERMDI: {
// Perform simplifications of 2x64 vector swaps and splats.
// A swap is identified by an immediate value of 2, and a splat
@@ -118,8 +296,10 @@ bool PPCMIPeephole::simplifyCode(void) {
// XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed.
// We have to look through chains of COPY and SUBREG_TO_REG
// to find the real source values for comparison.
- unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg());
- unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg());
+ unsigned TrueReg1 =
+ TII->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
+ unsigned TrueReg2 =
+ TII->lookThruCopyLike(MI.getOperand(2).getReg(), MRI);
if (TrueReg1 == TrueReg2
&& TargetRegisterInfo::isVirtualRegister(TrueReg1)) {
@@ -133,7 +313,8 @@ bool PPCMIPeephole::simplifyCode(void) {
auto isConversionOfLoadAndSplat = [=]() -> bool {
if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS)
return false;
- unsigned DefReg = lookThruCopyLike(DefMI->getOperand(1).getReg());
+ unsigned DefReg =
+ TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
MachineInstr *LoadMI = MRI->getVRegDef(DefReg);
if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX)
@@ -159,10 +340,10 @@ bool PPCMIPeephole::simplifyCode(void) {
// can replace it with a copy.
if (DefOpc == PPC::XXPERMDI) {
unsigned FeedImmed = DefMI->getOperand(3).getImm();
- unsigned FeedReg1
- = lookThruCopyLike(DefMI->getOperand(1).getReg());
- unsigned FeedReg2
- = lookThruCopyLike(DefMI->getOperand(2).getReg());
+ unsigned FeedReg1 =
+ TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+ unsigned FeedReg2 =
+ TII->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) {
DEBUG(dbgs()
@@ -220,7 +401,8 @@ bool PPCMIPeephole::simplifyCode(void) {
case PPC::XXSPLTW: {
unsigned MyOpcode = MI.getOpcode();
unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
- unsigned TrueReg = lookThruCopyLike(MI.getOperand(OpNo).getReg());
+ unsigned TrueReg =
+ TII->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI);
if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
break;
MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
@@ -282,7 +464,8 @@ bool PPCMIPeephole::simplifyCode(void) {
}
case PPC::XVCVDPSP: {
// If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant.
- unsigned TrueReg = lookThruCopyLike(MI.getOperand(1).getReg());
+ unsigned TrueReg =
+ TII->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
break;
MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
@@ -290,8 +473,10 @@ bool PPCMIPeephole::simplifyCode(void) {
// This can occur when building a vector of single precision or integer
// values.
if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
- unsigned DefsReg1 = lookThruCopyLike(DefMI->getOperand(1).getReg());
- unsigned DefsReg2 = lookThruCopyLike(DefMI->getOperand(2).getReg());
+ unsigned DefsReg1 =
+ TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+ unsigned DefsReg2 =
+ TII->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) ||
!TargetRegisterInfo::isVirtualRegister(DefsReg2))
break;
@@ -336,8 +521,248 @@ bool PPCMIPeephole::simplifyCode(void) {
}
break;
}
+ case PPC::EXTSH:
+ case PPC::EXTSH8:
+ case PPC::EXTSH8_32_64: {
+ if (!EnableSExtElimination) break;
+ unsigned NarrowReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(NarrowReg))
+ break;
+
+ MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg);
+ // If we've used a zero-extending load that we will sign-extend,
+ // just do a sign-extending load.
+ if (SrcMI->getOpcode() == PPC::LHZ ||
+ SrcMI->getOpcode() == PPC::LHZX) {
+ if (!MRI->hasOneNonDBGUse(SrcMI->getOperand(0).getReg()))
+ break;
+ auto is64Bit = [] (unsigned Opcode) {
+ return Opcode == PPC::EXTSH8;
+ };
+ auto isXForm = [] (unsigned Opcode) {
+ return Opcode == PPC::LHZX;
+ };
+ auto getSextLoadOp = [] (bool is64Bit, bool isXForm) {
+ if (is64Bit)
+ if (isXForm) return PPC::LHAX8;
+ else return PPC::LHA8;
+ else
+ if (isXForm) return PPC::LHAX;
+ else return PPC::LHA;
+ };
+ unsigned Opc = getSextLoadOp(is64Bit(MI.getOpcode()),
+ isXForm(SrcMI->getOpcode()));
+ DEBUG(dbgs() << "Zero-extending load\n");
+ DEBUG(SrcMI->dump());
+ DEBUG(dbgs() << "and sign-extension\n");
+ DEBUG(MI.dump());
+ DEBUG(dbgs() << "are merged into sign-extending load\n");
+ SrcMI->setDesc(TII->get(Opc));
+ SrcMI->getOperand(0).setReg(MI.getOperand(0).getReg());
+ ToErase = &MI;
+ Simplified = true;
+ NumEliminatedSExt++;
+ }
+ break;
+ }
+ case PPC::EXTSW:
+ case PPC::EXTSW_32:
+ case PPC::EXTSW_32_64: {
+ if (!EnableSExtElimination) break;
+ unsigned NarrowReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(NarrowReg))
+ break;
+
+ MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg);
+ // If we've used a zero-extending load that we will sign-extend,
+ // just do a sign-extending load.
+ if (SrcMI->getOpcode() == PPC::LWZ ||
+ SrcMI->getOpcode() == PPC::LWZX) {
+ if (!MRI->hasOneNonDBGUse(SrcMI->getOperand(0).getReg()))
+ break;
+ auto is64Bit = [] (unsigned Opcode) {
+ return Opcode == PPC::EXTSW || Opcode == PPC::EXTSW_32_64;
+ };
+ auto isXForm = [] (unsigned Opcode) {
+ return Opcode == PPC::LWZX;
+ };
+ auto getSextLoadOp = [] (bool is64Bit, bool isXForm) {
+ if (is64Bit)
+ if (isXForm) return PPC::LWAX;
+ else return PPC::LWA;
+ else
+ if (isXForm) return PPC::LWAX_32;
+ else return PPC::LWA_32;
+ };
+ unsigned Opc = getSextLoadOp(is64Bit(MI.getOpcode()),
+ isXForm(SrcMI->getOpcode()));
+ DEBUG(dbgs() << "Zero-extending load\n");
+ DEBUG(SrcMI->dump());
+ DEBUG(dbgs() << "and sign-extension\n");
+ DEBUG(MI.dump());
+ DEBUG(dbgs() << "are merged into sign-extending load\n");
+ SrcMI->setDesc(TII->get(Opc));
+ SrcMI->getOperand(0).setReg(MI.getOperand(0).getReg());
+ ToErase = &MI;
+ Simplified = true;
+ NumEliminatedSExt++;
+ } else if (MI.getOpcode() == PPC::EXTSW_32_64 &&
+ TII->isSignExtended(*SrcMI)) {
+ // We can eliminate EXTSW if the input is known to be already
+ // sign-extended.
+ DEBUG(dbgs() << "Removing redundant sign-extension\n");
+ unsigned TmpReg =
+ MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass);
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::IMPLICIT_DEF),
+ TmpReg);
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::INSERT_SUBREG),
+ MI.getOperand(0).getReg())
+ .addReg(TmpReg)
+ .addReg(NarrowReg)
+ .addImm(PPC::sub_32);
+ ToErase = &MI;
+ Simplified = true;
+ NumEliminatedSExt++;
+ }
+ break;
+ }
+ case PPC::RLDICL: {
+ // We can eliminate RLDICL (e.g. for zero-extension)
+ // if all bits to clear are already zero in the input.
+      // This code assumes the following code sequence for zero-extension.
+ // %6 = COPY %5:sub_32; (optional)
+ // %8 = IMPLICIT_DEF;
+ // %7<def,tied1> = INSERT_SUBREG %8<tied0>, %6, sub_32;
+ if (!EnableZExtElimination) break;
+
+ if (MI.getOperand(2).getImm() != 0)
+ break;
+
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ break;
+
+ MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (!(SrcMI && SrcMI->getOpcode() == PPC::INSERT_SUBREG &&
+ SrcMI->getOperand(0).isReg() && SrcMI->getOperand(1).isReg()))
+ break;
+
+ MachineInstr *ImpDefMI, *SubRegMI;
+ ImpDefMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg());
+ SubRegMI = MRI->getVRegDef(SrcMI->getOperand(2).getReg());
+ if (ImpDefMI->getOpcode() != PPC::IMPLICIT_DEF) break;
+
+ SrcMI = SubRegMI;
+ if (SubRegMI->getOpcode() == PPC::COPY) {
+ unsigned CopyReg = SubRegMI->getOperand(1).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(CopyReg))
+ SrcMI = MRI->getVRegDef(CopyReg);
+ }
+
+ unsigned KnownZeroCount = getKnownLeadingZeroCount(SrcMI, TII);
+ if (MI.getOperand(3).getImm() <= KnownZeroCount) {
+ DEBUG(dbgs() << "Removing redundant zero-extension\n");
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .addReg(SrcReg);
+ ToErase = &MI;
+ Simplified = true;
+ NumEliminatedZExt++;
+ }
+ break;
+ }
+
+ // TODO: Any instruction that has an immediate form fed only by a PHI
+ // whose operands are all load immediate can be folded away. We currently
+ // do this for ADD instructions, but should expand it to arithmetic and
+ // binary instructions with immediate forms in the future.
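+      // For example (vreg numbers are illustrative):
+      //   %5 = LI 1                 ; in predecessor %bb.1
+      //   %6 = LI 2                 ; in predecessor %bb.2
+      //   %7 = PHI %5, <%bb.1>, %6, <%bb.2>
+      //   %8 = ADD4 %7, %4          ; %4 dominates both LIs
+      // becomes
+      //   %5 = ADDI %4, 1           ; in %bb.1
+      //   %6 = ADDI %4, 2           ; in %bb.2
+      //   %8 = COPY %7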
+ case PPC::ADD4:
+ case PPC::ADD8: {
+ auto isSingleUsePHI = [&](MachineOperand *PhiOp) {
+ assert(PhiOp && "Invalid Operand!");
+ MachineInstr *DefPhiMI = getVRegDefOrNull(PhiOp, MRI);
+
+ return DefPhiMI && (DefPhiMI->getOpcode() == PPC::PHI) &&
+ MRI->hasOneNonDBGUse(DefPhiMI->getOperand(0).getReg());
+ };
+
+ auto dominatesAllSingleUseLIs = [&](MachineOperand *DominatorOp,
+ MachineOperand *PhiOp) {
+ assert(PhiOp && "Invalid Operand!");
+ assert(DominatorOp && "Invalid Operand!");
+ MachineInstr *DefPhiMI = getVRegDefOrNull(PhiOp, MRI);
+ MachineInstr *DefDomMI = getVRegDefOrNull(DominatorOp, MRI);
+
+        // Note: the vregs appear only at the odd operand indices of the PHI
+        // node; the even indices hold the incoming basic block info.
+ for (unsigned i = 1; i < DefPhiMI->getNumOperands(); i += 2) {
+ MachineInstr *LiMI =
+ getVRegDefOrNull(&DefPhiMI->getOperand(i), MRI);
+ if (!LiMI ||
+ (LiMI->getOpcode() != PPC::LI && LiMI->getOpcode() != PPC::LI8)
+ || !MRI->hasOneNonDBGUse(LiMI->getOperand(0).getReg()) ||
+ !MDT->dominates(DefDomMI, LiMI))
+ return false;
+ }
+
+ return true;
+ };
+
+ MachineOperand Op1 = MI.getOperand(1);
+ MachineOperand Op2 = MI.getOperand(2);
+ if (isSingleUsePHI(&Op2) && dominatesAllSingleUseLIs(&Op1, &Op2))
+ std::swap(Op1, Op2);
+ else if (!isSingleUsePHI(&Op1) || !dominatesAllSingleUseLIs(&Op2, &Op1))
+ break; // We don't have an ADD fed by LI's that can be transformed
+
+ // Now we know that Op1 is the PHI node and Op2 is the dominator
+ unsigned DominatorReg = Op2.getReg();
+
+ const TargetRegisterClass *TRC = MI.getOpcode() == PPC::ADD8
+ ? &PPC::G8RC_and_G8RC_NOX0RegClass
+ : &PPC::GPRC_and_GPRC_NOR0RegClass;
+ MRI->setRegClass(DominatorReg, TRC);
+
+ // replace LIs with ADDIs
+ MachineInstr *DefPhiMI = getVRegDefOrNull(&Op1, MRI);
+ for (unsigned i = 1; i < DefPhiMI->getNumOperands(); i += 2) {
+ MachineInstr *LiMI = getVRegDefOrNull(&DefPhiMI->getOperand(i), MRI);
+ DEBUG(dbgs() << "Optimizing LI to ADDI: ");
+ DEBUG(LiMI->dump());
+
+        // There could be repeated registers in the PHI, e.g.: %1 =
+        // PHI %6, <%bb.2>, %8, <%bb.3>, %8, <%bb.6>; so if we've
+        // already replaced the def instruction, skip it.
+ if (LiMI->getOpcode() == PPC::ADDI || LiMI->getOpcode() == PPC::ADDI8)
+ continue;
+
+ assert((LiMI->getOpcode() == PPC::LI ||
+ LiMI->getOpcode() == PPC::LI8) &&
+ "Invalid Opcode!");
+ auto LiImm = LiMI->getOperand(1).getImm(); // save the imm of LI
+ LiMI->RemoveOperand(1); // remove the imm of LI
+ LiMI->setDesc(TII->get(LiMI->getOpcode() == PPC::LI ? PPC::ADDI
+ : PPC::ADDI8));
+ MachineInstrBuilder(*LiMI->getParent()->getParent(), *LiMI)
+ .addReg(DominatorReg)
+ .addImm(LiImm); // restore the imm of LI
+ DEBUG(LiMI->dump());
+ }
+
+ // Replace ADD with COPY
+ DEBUG(dbgs() << "Optimizing ADD to COPY: ");
+ DEBUG(MI.dump());
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .add(Op1);
+ ToErase = &MI;
+ Simplified = true;
+ NumOptADDLIs++;
+ break;
+ }
}
}
+
// If the last instruction was marked for elimination,
// remove it now.
if (ToErase) {
@@ -346,37 +771,502 @@ bool PPCMIPeephole::simplifyCode(void) {
}
}
+ // Eliminate all the TOC save instructions which are redundant.
+ Simplified |= eliminateRedundantTOCSaves(TOCSaves);
+ // We try to eliminate redundant compare instruction.
+ Simplified |= eliminateRedundantCompare();
+
return Simplified;
}
-// This is used to find the "true" source register for an
-// XXPERMDI instruction, since MachineCSE does not handle the
-// "copy-like" operations (Copy and SubregToReg). Returns
-// the original SrcReg unless it is the target of a copy-like
-// operation, in which case we chain backwards through all
-// such operations to the ultimate source register. If a
-// physical register is encountered, we stop the search.
-unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) {
+// helper functions for eliminateRedundantCompare
+static bool isEqOrNe(MachineInstr *BI) {
+ PPC::Predicate Pred = (PPC::Predicate)BI->getOperand(0).getImm();
+ unsigned PredCond = PPC::getPredicateCondition(Pred);
+ return (PredCond == PPC::PRED_EQ || PredCond == PPC::PRED_NE);
+}
+
+static bool isSupportedCmpOp(unsigned opCode) {
+ return (opCode == PPC::CMPLD || opCode == PPC::CMPD ||
+ opCode == PPC::CMPLW || opCode == PPC::CMPW ||
+ opCode == PPC::CMPLDI || opCode == PPC::CMPDI ||
+ opCode == PPC::CMPLWI || opCode == PPC::CMPWI);
+}
+
+static bool is64bitCmpOp(unsigned opCode) {
+ return (opCode == PPC::CMPLD || opCode == PPC::CMPD ||
+ opCode == PPC::CMPLDI || opCode == PPC::CMPDI);
+}
+
+static bool isSignedCmpOp(unsigned opCode) {
+ return (opCode == PPC::CMPD || opCode == PPC::CMPW ||
+ opCode == PPC::CMPDI || opCode == PPC::CMPWI);
+}
+
+static unsigned getSignedCmpOpCode(unsigned opCode) {
+ if (opCode == PPC::CMPLD) return PPC::CMPD;
+ if (opCode == PPC::CMPLW) return PPC::CMPW;
+ if (opCode == PPC::CMPLDI) return PPC::CMPDI;
+ if (opCode == PPC::CMPLWI) return PPC::CMPWI;
+ return opCode;
+}
+
+// We can decrement immediate x in (GE x) by changing it to (GT x-1) or
+// (LT x) to (LE x-1)
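+// For example, a branch on (a >= 5) can instead branch on (a > 4), unless the
+// immediate is already the minimum value for the comparison.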
+static unsigned getPredicateToDecImm(MachineInstr *BI, MachineInstr *CMPI) {
+ uint64_t Imm = CMPI->getOperand(2).getImm();
+ bool SignedCmp = isSignedCmpOp(CMPI->getOpcode());
+ if ((!SignedCmp && Imm == 0) || (SignedCmp && Imm == 0x8000))
+ return 0;
+
+ PPC::Predicate Pred = (PPC::Predicate)BI->getOperand(0).getImm();
+ unsigned PredCond = PPC::getPredicateCondition(Pred);
+ unsigned PredHint = PPC::getPredicateHint(Pred);
+ if (PredCond == PPC::PRED_GE)
+ return PPC::getPredicate(PPC::PRED_GT, PredHint);
+ if (PredCond == PPC::PRED_LT)
+ return PPC::getPredicate(PPC::PRED_LE, PredHint);
+
+ return 0;
+}
+
+// We can increment immediate x in (GT x) by changing it to (GE x+1) or
+// (LE x) to (LT x+1)
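+// For example, a branch on (a > 5) can instead branch on (a >= 6), unless the
+// immediate is already the maximum value for the comparison.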
+static unsigned getPredicateToIncImm(MachineInstr *BI, MachineInstr *CMPI) {
+ uint64_t Imm = CMPI->getOperand(2).getImm();
+ bool SignedCmp = isSignedCmpOp(CMPI->getOpcode());
+ if ((!SignedCmp && Imm == 0xFFFF) || (SignedCmp && Imm == 0x7FFF))
+ return 0;
+
+ PPC::Predicate Pred = (PPC::Predicate)BI->getOperand(0).getImm();
+ unsigned PredCond = PPC::getPredicateCondition(Pred);
+ unsigned PredHint = PPC::getPredicateHint(Pred);
+ if (PredCond == PPC::PRED_GT)
+ return PPC::getPredicate(PPC::PRED_GE, PredHint);
+ if (PredCond == PPC::PRED_LE)
+ return PPC::getPredicate(PPC::PRED_LT, PredHint);
+
+ return 0;
+}
+
+// This function takes a PHI node and returns the register value incoming from
+// the specified BB.
+static unsigned getIncomingRegForBlock(MachineInstr *Phi,
+ MachineBasicBlock *MBB) {
+ for (unsigned I = 2, E = Phi->getNumOperands() + 1; I != E; I += 2) {
+ MachineOperand &MO = Phi->getOperand(I);
+ if (MO.getMBB() == MBB)
+ return Phi->getOperand(I-1).getReg();
+ }
+ llvm_unreachable("invalid src basic block for this Phi node\n");
+ return 0;
+}
+
+// This function tracks the source of the register through register copies.
+// If BB1 and BB2 are non-NULL, we also track through PHI instructions in BB2,
+// assuming that control flows from BB1 into BB2.
+static unsigned getSrcVReg(unsigned Reg, MachineBasicBlock *BB1,
+ MachineBasicBlock *BB2, MachineRegisterInfo *MRI) {
+ unsigned SrcReg = Reg;
+ while (1) {
+ unsigned NextReg = SrcReg;
+ MachineInstr *Inst = MRI->getVRegDef(SrcReg);
+ if (BB1 && Inst->getOpcode() == PPC::PHI && Inst->getParent() == BB2) {
+ NextReg = getIncomingRegForBlock(Inst, BB1);
+      // We track through the PHI only once to avoid an infinite loop.
+ BB1 = nullptr;
+ }
+ else if (Inst->isFullCopy())
+ NextReg = Inst->getOperand(1).getReg();
+ if (NextReg == SrcReg || !TargetRegisterInfo::isVirtualRegister(NextReg))
+ break;
+ SrcReg = NextReg;
+ }
+ return SrcReg;
+}
+
+static bool eligibleForCompareElimination(MachineBasicBlock &MBB,
+ MachineBasicBlock *&PredMBB,
+ MachineBasicBlock *&MBBtoMoveCmp,
+ MachineRegisterInfo *MRI) {
+
+ auto isEligibleBB = [&](MachineBasicBlock &BB) {
+ auto BII = BB.getFirstInstrTerminator();
+ // We optimize BBs ending with a conditional branch.
+ // We check only for BCC here, not BCCLR, because BCCLR
+ // will be formed only later in the pipeline.
+ if (BB.succ_size() == 2 &&
+ BII != BB.instr_end() &&
+ (*BII).getOpcode() == PPC::BCC &&
+ (*BII).getOperand(1).isReg()) {
+ // We optimize only if the condition code is used only by one BCC.
+ unsigned CndReg = (*BII).getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(CndReg) ||
+ !MRI->hasOneNonDBGUse(CndReg))
+ return false;
+
+ MachineInstr *CMPI = MRI->getVRegDef(CndReg);
+ // We assume compare and branch are in the same BB for ease of analysis.
+ if (CMPI->getParent() != &BB)
+ return false;
+
+ // We skip this BB if a physical register is used in comparison.
+ for (MachineOperand &MO : CMPI->operands())
+ if (MO.isReg() && !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ return false;
+
+ return true;
+ }
+ return false;
+ };
+
+  // If this BB has more than one successor, we can create a new BB and
+  // move the compare instruction into the new BB.
+  // So far, we do not move a compare instruction to a BB having multiple
+  // successors, to avoid potentially increasing code size.
+ auto isEligibleForMoveCmp = [](MachineBasicBlock &BB) {
+ return BB.succ_size() == 1;
+ };
+
+ if (!isEligibleBB(MBB))
+ return false;
+
+ unsigned NumPredBBs = MBB.pred_size();
+ if (NumPredBBs == 1) {
+ MachineBasicBlock *TmpMBB = *MBB.pred_begin();
+ if (isEligibleBB(*TmpMBB)) {
+ PredMBB = TmpMBB;
+ MBBtoMoveCmp = nullptr;
+ return true;
+ }
+ }
+ else if (NumPredBBs == 2) {
+    // We check for the partially redundant case.
+ // So far, we support cases with only two predecessors
+ // to avoid increasing the number of instructions.
+ MachineBasicBlock::pred_iterator PI = MBB.pred_begin();
+ MachineBasicBlock *Pred1MBB = *PI;
+ MachineBasicBlock *Pred2MBB = *(PI+1);
+
+ if (isEligibleBB(*Pred1MBB) && isEligibleForMoveCmp(*Pred2MBB)) {
+ // We assume Pred1MBB is the BB containing the compare to be merged and
+ // Pred2MBB is the BB to which we will append a compare instruction.
+ // Hence we can proceed as is.
+ }
+ else if (isEligibleBB(*Pred2MBB) && isEligibleForMoveCmp(*Pred1MBB)) {
+ // We need to swap Pred1MBB and Pred2MBB to canonicalize.
+ std::swap(Pred1MBB, Pred2MBB);
+ }
+ else return false;
+
+ // Here, Pred2MBB is the BB to which we need to append a compare inst.
+ // We cannot move the compare instruction if operands are not available
+ // in Pred2MBB (i.e. defined in MBB by an instruction other than PHI).
+ MachineInstr *BI = &*MBB.getFirstInstrTerminator();
+ MachineInstr *CMPI = MRI->getVRegDef(BI->getOperand(1).getReg());
+ for (int I = 1; I <= 2; I++)
+ if (CMPI->getOperand(I).isReg()) {
+ MachineInstr *Inst = MRI->getVRegDef(CMPI->getOperand(I).getReg());
+ if (Inst->getParent() == &MBB && Inst->getOpcode() != PPC::PHI)
+ return false;
+ }
+
+ PredMBB = Pred1MBB;
+ MBBtoMoveCmp = Pred2MBB;
+ return true;
+ }
+
+ return false;
+}
+
+// This function iterates over the input map containing pairs of a TOC save
+// instruction and a flag. The flag is set to false if the TOC save is proven
+// redundant. This function then erases from the basic block all the TOC saves
+// marked as redundant.
+bool PPCMIPeephole::eliminateRedundantTOCSaves(
+ std::map<MachineInstr *, bool> &TOCSaves) {
+ bool Simplified = false;
+ int NumKept = 0;
+ for (auto TOCSave : TOCSaves) {
+ if (!TOCSave.second) {
+ TOCSave.first->eraseFromParent();
+ RemoveTOCSave++;
+ Simplified = true;
+ } else {
+ NumKept++;
+ }
+ }
- while (true) {
+ if (NumKept > 1)
+ MultiTOCSaves++;
- MachineInstr *MI = MRI->getVRegDef(SrcReg);
- if (!MI->isCopyLike())
- return SrcReg;
+ return Simplified;
+}
- unsigned CopySrcReg;
- if (MI->isCopy())
- CopySrcReg = MI->getOperand(1).getReg();
+// If multiple conditional branches are executed based on the (essentially)
+// same comparison, we merge compare instructions into one and make multiple
+// conditional branches on this comparison.
+// For example,
+// if (a == 0) { ... }
+// else if (a < 0) { ... }
+// can be executed by one compare and two conditional branches instead of
+// two pairs of a compare and a conditional branch.
+//
+// This method merges two compare instructions in two MBBs and modifies the
+// compare and conditional branch instructions if needed.
+// For the above example, the input for this pass looks like:
+// cmplwi r3, 0
+// beq 0, .LBB0_3
+// cmpwi r3, -1
+// bgt 0, .LBB0_4
+// So, before merging two compares, we need to modify these instructions as
+// cmpwi r3, 0 ; cmplwi and cmpwi yield same result for beq
+// beq 0, .LBB0_3
+// cmpwi r3, 0 ; greater than -1 means greater than or equal to 0
+// bge 0, .LBB0_4
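+// After the merge, only one compare remains and both conditional branches use
+// the CR field it defines:
+// cmpwi r3, 0
+// beq 0, .LBB0_3
+// bge 0, .LBB0_4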
+
+bool PPCMIPeephole::eliminateRedundantCompare(void) {
+ // FIXME: this transformation is causing miscompiles. Disabling it for now
+ // until we can resolve the issue.
+ return false;
+ bool Simplified = false;
+
+ for (MachineBasicBlock &MBB2 : *MF) {
+ MachineBasicBlock *MBB1 = nullptr, *MBBtoMoveCmp = nullptr;
+
+ // For fully redundant case, we select two basic blocks MBB1 and MBB2
+ // as an optimization target if
+ // - both MBBs end with a conditional branch,
+ // - MBB1 is the only predecessor of MBB2, and
+    // - compare does not take a physical register as an operand in either MBB.
+ // In this case, eligibleForCompareElimination sets MBBtoMoveCmp nullptr.
+ //
+    // As a partially redundant case, we additionally handle the case where
+    // MBB2 has one additional predecessor, which has only one successor (MBB2).
+    // In this case, we move the compare instruction originally in MBB2 into
+    // MBBtoMoveCmp. This partially redundant case typically appears when
+    // compiling a while loop; here, MBBtoMoveCmp is the loop preheader.
+ //
+ // Overview of CFG of related basic blocks
+ // Fully redundant case Partially redundant case
+ // -------- ---------------- --------
+ // | MBB1 | (w/ 2 succ) | MBBtoMoveCmp | | MBB1 | (w/ 2 succ)
+ // -------- ---------------- --------
+ // | \ (w/ 1 succ) \ | \
+ // | \ \ | \
+ // | \ |
+ // -------- --------
+ // | MBB2 | (w/ 1 pred | MBB2 | (w/ 2 pred
+ // -------- and 2 succ) -------- and 2 succ)
+ // | \ | \
+ // | \ | \
+ //
+ if (!eligibleForCompareElimination(MBB2, MBB1, MBBtoMoveCmp, MRI))
+ continue;
+
+ MachineInstr *BI1 = &*MBB1->getFirstInstrTerminator();
+ MachineInstr *CMPI1 = MRI->getVRegDef(BI1->getOperand(1).getReg());
+
+ MachineInstr *BI2 = &*MBB2.getFirstInstrTerminator();
+ MachineInstr *CMPI2 = MRI->getVRegDef(BI2->getOperand(1).getReg());
+ bool IsPartiallyRedundant = (MBBtoMoveCmp != nullptr);
+
+ // We cannot optimize an unsupported compare opcode or
+    // a mix of 32-bit and 64-bit comparisons.
+ if (!isSupportedCmpOp(CMPI1->getOpcode()) ||
+ !isSupportedCmpOp(CMPI2->getOpcode()) ||
+ is64bitCmpOp(CMPI1->getOpcode()) != is64bitCmpOp(CMPI2->getOpcode()))
+ continue;
+
+ unsigned NewOpCode = 0;
+ unsigned NewPredicate1 = 0, NewPredicate2 = 0;
+ int16_t Imm1 = 0, NewImm1 = 0, Imm2 = 0, NewImm2 = 0;
+ bool SwapOperands = false;
+
+ if (CMPI1->getOpcode() != CMPI2->getOpcode()) {
+ // Typically, unsigned comparison is used for equality check, but
+ // we replace it with a signed comparison if the comparison
+ // to be merged is a signed comparison.
+ // In other cases of opcode mismatch, we cannot optimize this.
+ if (isEqOrNe(BI2) &&
+ CMPI1->getOpcode() == getSignedCmpOpCode(CMPI2->getOpcode()))
+ NewOpCode = CMPI1->getOpcode();
+ else if (isEqOrNe(BI1) &&
+ getSignedCmpOpCode(CMPI1->getOpcode()) == CMPI2->getOpcode())
+ NewOpCode = CMPI2->getOpcode();
+ else continue;
+ }
+
+ if (CMPI1->getOperand(2).isReg() && CMPI2->getOperand(2).isReg()) {
+ // In case of comparisons between two registers, these two registers
+      // must be the same in order to merge the two comparisons.
+ unsigned Cmp1Operand1 = getSrcVReg(CMPI1->getOperand(1).getReg(),
+ nullptr, nullptr, MRI);
+ unsigned Cmp1Operand2 = getSrcVReg(CMPI1->getOperand(2).getReg(),
+ nullptr, nullptr, MRI);
+ unsigned Cmp2Operand1 = getSrcVReg(CMPI2->getOperand(1).getReg(),
+ MBB1, &MBB2, MRI);
+ unsigned Cmp2Operand2 = getSrcVReg(CMPI2->getOperand(2).getReg(),
+ MBB1, &MBB2, MRI);
+
+ if (Cmp1Operand1 == Cmp2Operand1 && Cmp1Operand2 == Cmp2Operand2) {
+ // Same pair of registers in the same order; ready to merge as is.
+ }
+ else if (Cmp1Operand1 == Cmp2Operand2 && Cmp1Operand2 == Cmp2Operand1) {
+ // Same pair of registers in different order.
+ // We reverse the predicate to merge compare instructions.
+ PPC::Predicate Pred = (PPC::Predicate)BI2->getOperand(0).getImm();
+ NewPredicate2 = (unsigned)PPC::getSwappedPredicate(Pred);
+ // In case of partial redundancy, we need to swap operands
+ // in another compare instruction.
+ SwapOperands = true;
+ }
+ else continue;
+ }
+ else if (CMPI1->getOperand(2).isImm() && CMPI2->getOperand(2).isImm()) {
+ // In case of comparisons between a register and an immediate,
+      // the operand register must be the same for the two compare instructions.
+ unsigned Cmp1Operand1 = getSrcVReg(CMPI1->getOperand(1).getReg(),
+ nullptr, nullptr, MRI);
+ unsigned Cmp2Operand1 = getSrcVReg(CMPI2->getOperand(1).getReg(),
+ MBB1, &MBB2, MRI);
+ if (Cmp1Operand1 != Cmp2Operand1)
+ continue;
+
+ NewImm1 = Imm1 = (int16_t)CMPI1->getOperand(2).getImm();
+ NewImm2 = Imm2 = (int16_t)CMPI2->getOperand(2).getImm();
+
+      // If the immediates are not the same, we try to adjust by changing the
+      // predicate; e.g. GT imm means GE (imm+1).
+ if (Imm1 != Imm2 && (!isEqOrNe(BI2) || !isEqOrNe(BI1))) {
+ int Diff = Imm1 - Imm2;
+ if (Diff < -2 || Diff > 2)
+ continue;
+
+ unsigned PredToInc1 = getPredicateToIncImm(BI1, CMPI1);
+ unsigned PredToDec1 = getPredicateToDecImm(BI1, CMPI1);
+ unsigned PredToInc2 = getPredicateToIncImm(BI2, CMPI2);
+ unsigned PredToDec2 = getPredicateToDecImm(BI2, CMPI2);
+ if (Diff == 2) {
+ if (PredToInc2 && PredToDec1) {
+ NewPredicate2 = PredToInc2;
+ NewPredicate1 = PredToDec1;
+ NewImm2++;
+ NewImm1--;
+ }
+ }
+ else if (Diff == 1) {
+ if (PredToInc2) {
+ NewImm2++;
+ NewPredicate2 = PredToInc2;
+ }
+ else if (PredToDec1) {
+ NewImm1--;
+ NewPredicate1 = PredToDec1;
+ }
+ }
+ else if (Diff == -1) {
+ if (PredToDec2) {
+ NewImm2--;
+ NewPredicate2 = PredToDec2;
+ }
+ else if (PredToInc1) {
+ NewImm1++;
+ NewPredicate1 = PredToInc1;
+ }
+ }
+ else if (Diff == -2) {
+ if (PredToDec2 && PredToInc1) {
+ NewPredicate2 = PredToDec2;
+ NewPredicate1 = PredToInc1;
+ NewImm2--;
+ NewImm1++;
+ }
+ }
+ }
+
+      // We cannot merge the two compares if the immediates are not the same.
+ if (NewImm2 != NewImm1)
+ continue;
+ }
+
+ DEBUG(dbgs() << "Optimize two pairs of compare and branch:\n");
+ DEBUG(CMPI1->dump());
+ DEBUG(BI1->dump());
+ DEBUG(CMPI2->dump());
+ DEBUG(BI2->dump());
+
+ // We adjust opcode, predicates and immediate as we determined above.
+ if (NewOpCode != 0 && NewOpCode != CMPI1->getOpcode()) {
+ CMPI1->setDesc(TII->get(NewOpCode));
+ }
+ if (NewPredicate1) {
+ BI1->getOperand(0).setImm(NewPredicate1);
+ }
+ if (NewPredicate2) {
+ BI2->getOperand(0).setImm(NewPredicate2);
+ }
+ if (NewImm1 != Imm1) {
+ CMPI1->getOperand(2).setImm(NewImm1);
+ }
+
+ if (IsPartiallyRedundant) {
+ // We touch up the compare instruction in MBB2 and move it to
+      // a previous BB to handle the partially redundant case.
+ if (SwapOperands) {
+ unsigned Op1 = CMPI2->getOperand(1).getReg();
+ unsigned Op2 = CMPI2->getOperand(2).getReg();
+ CMPI2->getOperand(1).setReg(Op2);
+ CMPI2->getOperand(2).setReg(Op1);
+ }
+ if (NewImm2 != Imm2)
+ CMPI2->getOperand(2).setImm(NewImm2);
+
+ for (int I = 1; I <= 2; I++) {
+ if (CMPI2->getOperand(I).isReg()) {
+ MachineInstr *Inst = MRI->getVRegDef(CMPI2->getOperand(I).getReg());
+ if (Inst->getParent() != &MBB2)
+ continue;
+
+ assert(Inst->getOpcode() == PPC::PHI &&
+ "We cannot support if an operand comes from this BB.");
+ unsigned SrcReg = getIncomingRegForBlock(Inst, MBBtoMoveCmp);
+ CMPI2->getOperand(I).setReg(SrcReg);
+ }
+ }
+ auto I = MachineBasicBlock::iterator(MBBtoMoveCmp->getFirstTerminator());
+ MBBtoMoveCmp->splice(I, &MBB2, MachineBasicBlock::iterator(CMPI2));
+
+ DebugLoc DL = CMPI2->getDebugLoc();
+ unsigned NewVReg = MRI->createVirtualRegister(&PPC::CRRCRegClass);
+ BuildMI(MBB2, MBB2.begin(), DL,
+ TII->get(PPC::PHI), NewVReg)
+ .addReg(BI1->getOperand(1).getReg()).addMBB(MBB1)
+ .addReg(BI2->getOperand(1).getReg()).addMBB(MBBtoMoveCmp);
+ BI2->getOperand(1).setReg(NewVReg);
+ }
else {
- assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
- CopySrcReg = MI->getOperand(2).getReg();
+ // We finally eliminate compare instruction in MBB2.
+ BI2->getOperand(1).setReg(BI1->getOperand(1).getReg());
+ CMPI2->eraseFromParent();
}
+ BI2->getOperand(1).setIsKill(true);
+ BI1->getOperand(1).setIsKill(false);
- if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
- return CopySrcReg;
+ DEBUG(dbgs() << "into a compare and two branches:\n");
+ DEBUG(CMPI1->dump());
+ DEBUG(BI1->dump());
+ DEBUG(BI2->dump());
+ if (IsPartiallyRedundant) {
+ DEBUG(dbgs() << "The following compare is moved into "
+ << printMBBReference(*MBBtoMoveCmp)
+ << " to handle partial redundancy.\n");
+ DEBUG(CMPI2->dump());
+ }
- SrcReg = CopySrcReg;
+ Simplified = true;
}
+
+ return Simplified;
}
} // end default namespace
diff --git a/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h b/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h
new file mode 100644
index 0000000000000..628ea2ab9fe62
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h
@@ -0,0 +1,198 @@
+//==-- PPCMachineBasicBlockUtils.h - Functions for common MBB operations ---==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines utility functions for commonly used operations on
+// MachineBasicBlock's.
+// NOTE: Include this file after defining DEBUG_TYPE so that the debug messages
+// can be emitted for the pass that is using this.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_PPC_MACHINE_BASIC_BLOCK_UTILS_H
+#define LLVM_LIB_TARGET_PPC_MACHINE_BASIC_BLOCK_UTILS_H
+
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#ifndef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-generic-mbb-utilities"
+#endif
+
+using namespace llvm;
+
+/// Given a basic block \p Successor that potentially contains PHIs, this
+/// function will look for any incoming values in the PHIs that are supposed to
+/// be coming from \p OrigMBB but whose definition is actually in \p NewMBB.
+/// Any such PHIs will be updated to reflect reality.
+static void updatePHIs(MachineBasicBlock *Successor, MachineBasicBlock *OrigMBB,
+ MachineBasicBlock *NewMBB, MachineRegisterInfo *MRI) {
+ for (auto &MI : Successor->instrs()) {
+ if (!MI.isPHI())
+ continue;
+ // This is a really ugly-looking loop, but it was pillaged directly from
+ // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
+ for (unsigned i = 2, e = MI.getNumOperands()+1; i != e; i += 2) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.getMBB() == OrigMBB) {
+        // Check if the instruction is actually defined in NewMBB.
+ if (MI.getOperand(i-1).isReg()) {
+ MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(i-1).getReg());
+ if (DefMI->getParent() == NewMBB || !OrigMBB->isSuccessor(Successor)) {
+ MO.setMBB(NewMBB);
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+
+/// Given a basic block \p Successor that potentially contains PHIs, this
+/// function will look for PHIs that have an incoming value from \p OrigMBB
+/// and will add the same incoming value from \p NewMBB.
+/// NOTE: This should only be used if \p NewMBB is an immediate dominator of
+/// \p OrigMBB.
+static void addIncomingValuesToPHIs(MachineBasicBlock *Successor,
+ MachineBasicBlock *OrigMBB,
+ MachineBasicBlock *NewMBB,
+ MachineRegisterInfo *MRI) {
+  assert(OrigMBB->isSuccessor(NewMBB) && "NewMBB must be a successor of OrigMBB");
+ for (auto &MI : Successor->instrs()) {
+ if (!MI.isPHI())
+ continue;
+ // This is a really ugly-looking loop, but it was pillaged directly from
+ // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
+ for (unsigned i = 2, e = MI.getNumOperands()+1; i != e; i += 2) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.getMBB() == OrigMBB) {
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
+ MIB.addReg(MI.getOperand(i-1).getReg()).addMBB(NewMBB);
+ break;
+ }
+ }
+ }
+}
+
+struct BlockSplitInfo {
+ MachineInstr *OrigBranch;
+ MachineInstr *SplitBefore;
+ MachineInstr *SplitCond;
+ bool InvertNewBranch;
+ bool InvertOrigBranch;
+ bool BranchToFallThrough;
+ const MachineBranchProbabilityInfo *MBPI;
+ MachineInstr *MIToDelete;
+ MachineInstr *NewCond;
+ bool allInstrsInSameMBB() {
+ if (!OrigBranch || !SplitBefore || !SplitCond)
+ return false;
+ MachineBasicBlock *MBB = OrigBranch->getParent();
+ if (SplitBefore->getParent() != MBB ||
+ SplitCond->getParent() != MBB)
+ return false;
+ if (MIToDelete && MIToDelete->getParent() != MBB)
+ return false;
+ if (NewCond && NewCond->getParent() != MBB)
+ return false;
+ return true;
+ }
+};
+
+/// Splits a MachineBasicBlock to branch before \p SplitBefore. The original
+/// branch is \p OrigBranch. The target of the new branch can either be the same
+/// as the target of the original branch or the fallthrough successor of the
+/// original block as determined by \p BranchToFallThrough. The branch
+/// conditions will be inverted according to \p InvertNewBranch and
+/// \p InvertOrigBranch. If an instruction that previously fed the branch is to
+/// be deleted, it is provided in \p MIToDelete and \p NewCond will be used as
+/// the branch condition. The branch probabilities will be set if the
+/// MachineBranchProbabilityInfo isn't null.
+static bool splitMBB(BlockSplitInfo &BSI) {
+ assert(BSI.allInstrsInSameMBB() &&
+ "All instructions must be in the same block.");
+
+ MachineBasicBlock *ThisMBB = BSI.OrigBranch->getParent();
+ MachineFunction *MF = ThisMBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ assert(MRI->isSSA() && "Can only do this while the function is in SSA form.");
+ if (ThisMBB->succ_size() != 2) {
+ DEBUG(dbgs() << "Don't know how to handle blocks that don't have exactly"
+                 << " two successors.\n");
+ return false;
+ }
+
+ const PPCInstrInfo *TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+ unsigned OrigBROpcode = BSI.OrigBranch->getOpcode();
+ unsigned InvertedOpcode =
+ OrigBROpcode == PPC::BC ? PPC::BCn :
+ OrigBROpcode == PPC::BCn ? PPC::BC :
+ OrigBROpcode == PPC::BCLR ? PPC::BCLRn : PPC::BCLR;
+ unsigned NewBROpcode = BSI.InvertNewBranch ? InvertedOpcode : OrigBROpcode;
+ MachineBasicBlock *OrigTarget = BSI.OrigBranch->getOperand(1).getMBB();
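+  // The fall-through successor is whichever of the two successors is not the
+  // target of the original branch.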
+ MachineBasicBlock *OrigFallThrough =
+ OrigTarget == *ThisMBB->succ_begin() ? *ThisMBB->succ_rbegin() :
+ *ThisMBB->succ_begin();
+ MachineBasicBlock *NewBRTarget =
+ BSI.BranchToFallThrough ? OrigFallThrough : OrigTarget;
+ BranchProbability ProbToNewTarget =
+ !BSI.MBPI ? BranchProbability::getUnknown() :
+ BSI.MBPI->getEdgeProbability(ThisMBB, NewBRTarget);
+
+ // Create a new basic block.
+ MachineBasicBlock::iterator InsertPoint = BSI.SplitBefore;
+ const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
+ MachineFunction::iterator It = ThisMBB->getIterator();
+ MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(++It, NewMBB);
+
+ // Move everything after SplitBefore into the new block.
+ NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end());
+ NewMBB->transferSuccessors(ThisMBB);
+
+ // Add the two successors to ThisMBB. The probabilities come from the
+ // existing blocks if available.
+ ThisMBB->addSuccessor(NewBRTarget, ProbToNewTarget);
+ ThisMBB->addSuccessor(NewMBB, ProbToNewTarget.getCompl());
+
+ // Add the branches to ThisMBB.
+ BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
+ TII->get(NewBROpcode)).addReg(BSI.SplitCond->getOperand(0).getReg())
+ .addMBB(NewBRTarget);
+ BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
+ TII->get(PPC::B)).addMBB(NewMBB);
+ if (BSI.MIToDelete)
+ BSI.MIToDelete->eraseFromParent();
+
+ // Change the condition on the original branch and invert it if requested.
+ auto FirstTerminator = NewMBB->getFirstTerminator();
+ if (BSI.NewCond) {
+ assert(FirstTerminator->getOperand(0).isReg() &&
+ "Can't update condition of unconditional branch.");
+ FirstTerminator->getOperand(0).setReg(BSI.NewCond->getOperand(0).getReg());
+ }
+ if (BSI.InvertOrigBranch)
+ FirstTerminator->setDesc(TII->get(InvertedOpcode));
+
+ // If any of the PHIs in the successors of NewMBB reference values that
+ // now come from NewMBB, they need to be updated.
+ for (auto *Succ : NewMBB->successors()) {
+ updatePHIs(Succ, ThisMBB, NewMBB, MRI);
+ }
+ addIncomingValuesToPHIs(NewBRTarget, ThisMBB, NewMBB, MRI);
+
+ DEBUG(dbgs() << "After splitting, ThisMBB:\n"; ThisMBB->dump());
+ DEBUG(dbgs() << "NewMBB:\n"; NewMBB->dump());
+ DEBUG(dbgs() << "New branch-to block:\n"; NewBRTarget->dump());
+ return true;
+}
+
+
+#endif
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index bc2d9a08b5e86..3923417257e8c 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -43,3 +43,17 @@ MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol() const {
"func_toc" +
Twine(MF.getFunctionNumber()));
}
+
+bool PPCFunctionInfo::isLiveInSExt(unsigned VReg) const {
+ for (const std::pair<unsigned, ISD::ArgFlagsTy> &LiveIn : LiveInAttrs)
+ if (LiveIn.first == VReg)
+ return LiveIn.second.isSExt();
+ return false;
+}
+
+bool PPCFunctionInfo::isLiveInZExt(unsigned VReg) const {
+ for (const std::pair<unsigned, ISD::ArgFlagsTy> &LiveIn : LiveInAttrs)
+ if (LiveIn.first == VReg)
+ return LiveIn.second.isZExt();
+ return false;
+}
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 202e10058b733..a9b6073106eae 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetCallingConv.h"
namespace llvm {
@@ -113,6 +114,10 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// copies
bool IsSplitCSR = false;
+  /// We keep track of the attributes for each live-in virtual register
+  /// so that the SExt/ZExt flags can be used by later optimizations.
+ std::vector<std::pair<unsigned, ISD::ArgFlagsTy>> LiveInAttrs;
+
public:
explicit PPCFunctionInfo(MachineFunction &MF) : MF(MF) {}
@@ -175,6 +180,19 @@ public:
unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; }
void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; }
+ /// This function associates attributes for each live-in virtual register.
+ void addLiveInAttr(unsigned VReg, ISD::ArgFlagsTy Flags) {
+ LiveInAttrs.push_back(std::make_pair(VReg, Flags));
+ }
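+
+  /// A typical flow (illustrative): calling-convention lowering records the
+  /// ABI flags of each live-in argument copy via addLiveInAttr(), and later
+  /// passes query isLiveInSExt()/isLiveInZExt() to avoid redundant extensions.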
+
+  /// This function returns true if the specified vreg is
+ /// a live-in register and sign-extended.
+ bool isLiveInSExt(unsigned VReg) const;
+
+  /// This function returns true if the specified vreg is
+ /// a live-in register and zero-extended.
+ bool isLiveInZExt(unsigned VReg) const;
+
int getCRSpillFrameIndex() const { return CRSpillFrameIndex; }
void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; }
diff --git a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
new file mode 100644
index 0000000000000..9501f0f89b81b
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -0,0 +1,95 @@
+//===--------- PPCPreEmitPeephole.cpp - Late peephole optimizations -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pre-emit peephole for catching opportunities introduced by late passes such
+// as MachineBlockPlacement.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-pre-emit-peephole"
+
+STATISTIC(NumRRConvertedInPreEmit,
+ "Number of r+r instructions converted to r+i in pre-emit peephole");
+STATISTIC(NumRemovedInPreEmit,
+ "Number of instructions deleted in pre-emit peephole");
+
+static cl::opt<bool>
+RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(false),
+ cl::desc("Run pre-emit peephole optimizations."));
+
+namespace {
+ class PPCPreEmitPeephole : public MachineFunctionPass {
+ public:
+ static char ID;
+ PPCPreEmitPeephole() : MachineFunctionPass(ID) {
+ initializePPCPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole)
+ return false;
+ bool Changed = false;
+ const PPCInstrInfo *TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo();
+ SmallVector<MachineInstr *, 4> InstrsToErase;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ MachineInstr *DefMIToErase = nullptr;
+ if (TII->convertToImmediateForm(MI, &DefMIToErase)) {
+ Changed = true;
+ NumRRConvertedInPreEmit++;
+ DEBUG(dbgs() << "Converted instruction to imm form: ");
+ DEBUG(MI.dump());
+ if (DefMIToErase) {
+ InstrsToErase.push_back(DefMIToErase);
+ }
+ }
+ }
+ }
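+      // The dead feeder instructions were only collected above; erase them
+      // now, after the walk over the function, so nothing is removed while
+      // the blocks are still being traversed.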
+ for (MachineInstr *MI : InstrsToErase) {
+ DEBUG(dbgs() << "PPC pre-emit peephole: erasing instruction: ");
+ DEBUG(MI->dump());
+ MI->eraseFromParent();
+ NumRemovedInPreEmit++;
+ }
+ return Changed;
+ }
+ };
+}
+
+INITIALIZE_PASS(PPCPreEmitPeephole, DEBUG_TYPE, "PowerPC Pre-Emit Peephole",
+ false, false)
+char PPCPreEmitPeephole::ID = 0;
+
+FunctionPass *llvm::createPPCPreEmitPeepholePass() {
+ return new PPCPreEmitPeephole();
+}
diff --git a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
index 8a18ab9e0e9a3..25b2b54cbe98a 100644
--- a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
+++ b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
@@ -22,9 +22,9 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
#define DEBUG_TYPE "ppc-qpx-load-splat"
@@ -60,7 +60,7 @@ FunctionPass *llvm::createPPCQPXLoadSplatPass() {
}
bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
bool MadeChange = false;
@@ -79,8 +79,8 @@ bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
}
// We're looking for a sequence like this:
- // %F0<def> = LFD 0, %X3<kill>, %QF0<imp-def>; mem:LD8[%a](tbaa=!2)
- // %QF1<def> = QVESPLATI %QF0<kill>, 0, %RM<imp-use>
+ // %f0 = LFD 0, killed %x3, implicit-def %qf0; mem:LD8[%a](tbaa=!2)
+ // %qf1 = QVESPLATI killed %qf0, 0, implicit %rm
for (auto SI = Splats.begin(); SI != Splats.end();) {
MachineInstr *SMI = *SI;
diff --git a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
new file mode 100644
index 0000000000000..5b2d7191683c0
--- /dev/null
+++ b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -0,0 +1,535 @@
+//===---- PPCReduceCRLogicals.cpp - Reduce CR Bit Logical operations ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass aims to reduce the number of logical operations on bits in the CR
+// register. These instructions have a fairly high latency and only a single
+// pipeline at their disposal in modern PPC cores. Furthermore, they have a
+// tendency to occur in fairly small blocks where there's little opportunity
+// to hide the latency between the CR logical operation and its user.
+//
+//===---------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-reduce-cr-ops"
+#include "PPCMachineBasicBlockUtils.h"
+
+STATISTIC(NumContainedSingleUseBinOps,
+ "Number of single-use binary CR logical ops contained in a block");
+STATISTIC(NumToSplitBlocks,
+ "Number of binary CR logical ops that can be used to split blocks");
+STATISTIC(TotalCRLogicals, "Number of CR logical ops.");
+STATISTIC(TotalNullaryCRLogicals,
+ "Number of nullary CR logical ops (CRSET/CRUNSET).");
+STATISTIC(TotalUnaryCRLogicals, "Number of unary CR logical ops.");
+STATISTIC(TotalBinaryCRLogicals, "Number of binary CR logical ops.");
+STATISTIC(NumBlocksSplitOnBinaryCROp,
+ "Number of blocks split on CR binary logical ops.");
+STATISTIC(NumNotSplitIdenticalOperands,
+ "Number of blocks not split due to operands being identical.");
+STATISTIC(NumNotSplitChainCopies,
+ "Number of blocks not split due to operands being chained copies.");
+STATISTIC(NumNotSplitWrongOpcode,
+ "Number of blocks not split due to the wrong opcode.");
+
+namespace llvm {
+ void initializePPCReduceCRLogicalsPass(PassRegistry&);
+}
+
+namespace {
+
+static bool isBinary(MachineInstr &MI) {
+ return MI.getNumOperands() == 3;
+}
+
+static bool isNullary(MachineInstr &MI) {
+ return MI.getNumOperands() == 1;
+}
+
+/// Given a CR logical operation \p CROp, branch opcode \p BROp as well as
+/// a flag to indicate if the first operand of \p CROp is used as the
+/// SplitBefore operand, determines whether either of the branches are to be
+/// inverted as well as whether the new target should be the original
+/// fall-through block.
+static void
+computeBranchTargetAndInversion(unsigned CROp, unsigned BROp, bool UsingDef1,
+ bool &InvertNewBranch, bool &InvertOrigBranch,
+ bool &TargetIsFallThrough) {
+ // The conditions under which each of the output operands should be [un]set
+ // can certainly be written much more concisely with just 3 if statements or
+ // ternary expressions. However, this provides a much clearer overview to the
+ // reader as to what is set for each <CROp, BROp, OpUsed> combination.
+ if (BROp == PPC::BC || BROp == PPC::BCLR) {
+ // Regular branches.
+ switch (CROp) {
+ default:
+ llvm_unreachable("Don't know how to handle this CR logical.");
+ case PPC::CROR:
+ InvertNewBranch = false;
+ InvertOrigBranch = false;
+ TargetIsFallThrough = false;
+ return;
+ case PPC::CRAND:
+ InvertNewBranch = true;
+ InvertOrigBranch = false;
+ TargetIsFallThrough = true;
+ return;
+ case PPC::CRNAND:
+ InvertNewBranch = true;
+ InvertOrigBranch = true;
+ TargetIsFallThrough = false;
+ return;
+ case PPC::CRNOR:
+ InvertNewBranch = false;
+ InvertOrigBranch = true;
+ TargetIsFallThrough = true;
+ return;
+ case PPC::CRORC:
+ InvertNewBranch = UsingDef1;
+ InvertOrigBranch = !UsingDef1;
+ TargetIsFallThrough = false;
+ return;
+ case PPC::CRANDC:
+ InvertNewBranch = !UsingDef1;
+ InvertOrigBranch = !UsingDef1;
+ TargetIsFallThrough = true;
+ return;
+ }
+ } else if (BROp == PPC::BCn || BROp == PPC::BCLRn) {
+ // Negated branches.
+ switch (CROp) {
+ default:
+ llvm_unreachable("Don't know how to handle this CR logical.");
+ case PPC::CROR:
+ InvertNewBranch = true;
+ InvertOrigBranch = false;
+ TargetIsFallThrough = true;
+ return;
+ case PPC::CRAND:
+ InvertNewBranch = false;
+ InvertOrigBranch = false;
+ TargetIsFallThrough = false;
+ return;
+ case PPC::CRNAND:
+ InvertNewBranch = false;
+ InvertOrigBranch = true;
+ TargetIsFallThrough = true;
+ return;
+ case PPC::CRNOR:
+ InvertNewBranch = true;
+ InvertOrigBranch = true;
+ TargetIsFallThrough = false;
+ return;
+ case PPC::CRORC:
+ InvertNewBranch = !UsingDef1;
+ InvertOrigBranch = !UsingDef1;
+ TargetIsFallThrough = true;
+ return;
+ case PPC::CRANDC:
+ InvertNewBranch = UsingDef1;
+ InvertOrigBranch = !UsingDef1;
+ TargetIsFallThrough = false;
+ return;
+ }
+ } else
+ llvm_unreachable("Don't know how to handle this branch.");
+}
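+
+// Illustrative example of the table above (register names are made up):
+// for a block ending in
+//   %c = CRAND %a, %b
+//   BC %c, <target>
+// splitting yields
+//   BCn %a, <fallthrough>   ; new branch: inverted, targets the fall-through
+//   ...
+//   BC %b, <target>         ; original branch: not inverted
+// i.e. "branch if (a && b)" skips the second test as soon as !a is known.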
+
+class PPCReduceCRLogicals : public MachineFunctionPass {
+
+public:
+ static char ID;
+ struct CRLogicalOpInfo {
+ MachineInstr *MI;
+ // FIXME: If chains of copies are to be handled, this should be a vector.
+ std::pair<MachineInstr*, MachineInstr*> CopyDefs;
+ std::pair<MachineInstr*, MachineInstr*> TrueDefs;
+ unsigned IsBinary : 1;
+ unsigned IsNullary : 1;
+ unsigned ContainedInBlock : 1;
+ unsigned FeedsISEL : 1;
+ unsigned FeedsBR : 1;
+ unsigned FeedsLogical : 1;
+ unsigned SingleUse : 1;
+ unsigned DefsSingleUse : 1;
+ unsigned SubregDef1;
+ unsigned SubregDef2;
+ CRLogicalOpInfo() : MI(nullptr), IsBinary(0), IsNullary(0),
+ ContainedInBlock(0), FeedsISEL(0), FeedsBR(0),
+ FeedsLogical(0), SingleUse(0), DefsSingleUse(1),
+ SubregDef1(0), SubregDef2(0) { }
+ void dump();
+ };
+
+private:
+ const PPCInstrInfo *TII;
+ MachineFunction *MF;
+ MachineRegisterInfo *MRI;
+ const MachineBranchProbabilityInfo *MBPI;
+
+ // A vector to contain all the CR logical operations
+ std::vector<CRLogicalOpInfo> AllCRLogicalOps;
+ void initialize(MachineFunction &MFParm);
+ void collectCRLogicals();
+ bool handleCROp(CRLogicalOpInfo &CRI);
+ bool splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI);
+ static bool isCRLogical(MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ return Opc == PPC::CRAND || Opc == PPC::CRNAND || Opc == PPC::CROR ||
+ Opc == PPC::CRXOR || Opc == PPC::CRNOR || Opc == PPC::CREQV ||
+ Opc == PPC::CRANDC || Opc == PPC::CRORC || Opc == PPC::CRSET ||
+ Opc == PPC::CRUNSET || Opc == PPC::CR6SET || Opc == PPC::CR6UNSET;
+ }
+ bool simplifyCode() {
+ bool Changed = false;
+ // Not using a range-based for loop here as the vector may grow while being
+ // operated on.
+ for (unsigned i = 0; i < AllCRLogicalOps.size(); i++)
+ Changed |= handleCROp(AllCRLogicalOps[i]);
+ return Changed;
+ }
+
+public:
+ PPCReduceCRLogicals() : MachineFunctionPass(ID) {
+ initializePPCReduceCRLogicalsPass(*PassRegistry::getPassRegistry());
+ }
+
+ MachineInstr *lookThroughCRCopy(unsigned Reg, unsigned &Subreg,
+ MachineInstr *&CpDef);
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ // If the subtarget doesn't use CR bits, there's nothing to do.
+ const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+ if (!STI.useCRBits())
+ return false;
+
+ initialize(MF);
+ collectCRLogicals();
+ return simplifyCode();
+ }
+ CRLogicalOpInfo createCRLogicalOpInfo(MachineInstr &MI);
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void PPCReduceCRLogicals::CRLogicalOpInfo::dump() {
+ dbgs() << "CRLogicalOpMI: ";
+ MI->dump();
+ dbgs() << "IsBinary: " << IsBinary << ", FeedsISEL: " << FeedsISEL;
+ dbgs() << ", FeedsBR: " << FeedsBR << ", FeedsLogical: ";
+ dbgs() << FeedsLogical << ", SingleUse: " << SingleUse;
+ dbgs() << ", DefsSingleUse: " << DefsSingleUse;
+ dbgs() << ", SubregDef1: " << SubregDef1 << ", SubregDef2: ";
+ dbgs() << SubregDef2 << ", ContainedInBlock: " << ContainedInBlock;
+ if (!IsNullary) {
+ dbgs() << "\nDefs:\n";
+ TrueDefs.first->dump();
+ }
+ if (IsBinary)
+ TrueDefs.second->dump();
+ dbgs() << "\n";
+ if (CopyDefs.first) {
+ dbgs() << "CopyDef1: ";
+ CopyDefs.first->dump();
+ }
+ if (CopyDefs.second) {
+ dbgs() << "CopyDef2: ";
+ CopyDefs.second->dump();
+ }
+}
+#endif
+
+PPCReduceCRLogicals::CRLogicalOpInfo
+PPCReduceCRLogicals::createCRLogicalOpInfo(MachineInstr &MIParam) {
+ CRLogicalOpInfo Ret;
+ Ret.MI = &MIParam;
+ // Get the defs
+ if (isNullary(MIParam)) {
+ Ret.IsNullary = 1;
+ Ret.TrueDefs = std::make_pair(nullptr, nullptr);
+ Ret.CopyDefs = std::make_pair(nullptr, nullptr);
+ } else {
+    MachineInstr *Def1 = lookThroughCRCopy(MIParam.getOperand(1).getReg(),
+                                           Ret.SubregDef1, Ret.CopyDefs.first);
+    assert(Def1 && "Must be able to find a definition of operand 1.");
+    Ret.DefsSingleUse &=
+      MRI->hasOneNonDBGUse(Def1->getOperand(0).getReg());
+    Ret.DefsSingleUse &=
+      MRI->hasOneNonDBGUse(Ret.CopyDefs.first->getOperand(0).getReg());
+ if (isBinary(MIParam)) {
+ Ret.IsBinary = 1;
+      MachineInstr *Def2 = lookThroughCRCopy(MIParam.getOperand(2).getReg(),
+                                             Ret.SubregDef2,
+                                             Ret.CopyDefs.second);
+      assert(Def2 && "Must be able to find a definition of operand 2.");
+      Ret.DefsSingleUse &=
+        MRI->hasOneNonDBGUse(Def2->getOperand(0).getReg());
+      Ret.DefsSingleUse &=
+        MRI->hasOneNonDBGUse(Ret.CopyDefs.second->getOperand(0).getReg());
+ Ret.TrueDefs = std::make_pair(Def1, Def2);
+ } else {
+ Ret.TrueDefs = std::make_pair(Def1, nullptr);
+ Ret.CopyDefs.second = nullptr;
+ }
+ }
+
+ Ret.ContainedInBlock = 1;
+ // Get the uses
+ for (MachineInstr &UseMI :
+ MRI->use_nodbg_instructions(MIParam.getOperand(0).getReg())) {
+ unsigned Opc = UseMI.getOpcode();
+ if (Opc == PPC::ISEL || Opc == PPC::ISEL8)
+ Ret.FeedsISEL = 1;
+ if (Opc == PPC::BC || Opc == PPC::BCn || Opc == PPC::BCLR ||
+ Opc == PPC::BCLRn)
+ Ret.FeedsBR = 1;
+ Ret.FeedsLogical = isCRLogical(UseMI);
+ if (UseMI.getParent() != MIParam.getParent())
+ Ret.ContainedInBlock = 0;
+ }
+ Ret.SingleUse = MRI->hasOneNonDBGUse(MIParam.getOperand(0).getReg()) ? 1 : 0;
+
+ // We now know whether all the uses of the CR logical are in the same block.
+ if (!Ret.IsNullary) {
+ Ret.ContainedInBlock &=
+ (MIParam.getParent() == Ret.TrueDefs.first->getParent());
+ if (Ret.IsBinary)
+ Ret.ContainedInBlock &=
+ (MIParam.getParent() == Ret.TrueDefs.second->getParent());
+ }
+ DEBUG(Ret.dump());
+ if (Ret.IsBinary && Ret.ContainedInBlock && Ret.SingleUse) {
+ NumContainedSingleUseBinOps++;
+ if (Ret.FeedsBR && Ret.DefsSingleUse)
+ NumToSplitBlocks++;
+ }
+ return Ret;
+}
+
+/// Looks through a COPY instruction to the actual definition of the CR-bit
+/// register and returns the instruction that defines it.
+/// FIXME: This currently handles what is by-far the most common case:
+/// an instruction that defines a CR field followed by a single copy of a bit
+/// from that field into a virtual register. If chains of copies need to be
+/// handled, this should have a loop until a non-copy instruction is found.
+MachineInstr *PPCReduceCRLogicals::lookThroughCRCopy(unsigned Reg,
+ unsigned &Subreg,
+ MachineInstr *&CpDef) {
+ Subreg = -1;
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return nullptr;
+ MachineInstr *Copy = MRI->getVRegDef(Reg);
+ CpDef = Copy;
+ if (!Copy->isCopy())
+ return Copy;
+ unsigned CopySrc = Copy->getOperand(1).getReg();
+ Subreg = Copy->getOperand(1).getSubReg();
+ if (!TargetRegisterInfo::isVirtualRegister(CopySrc)) {
+ const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+ // Set the Subreg
+ if (CopySrc == PPC::CR0EQ || CopySrc == PPC::CR6EQ)
+ Subreg = PPC::sub_eq;
+ if (CopySrc == PPC::CR0LT || CopySrc == PPC::CR6LT)
+ Subreg = PPC::sub_lt;
+ if (CopySrc == PPC::CR0GT || CopySrc == PPC::CR6GT)
+ Subreg = PPC::sub_gt;
+ if (CopySrc == PPC::CR0UN || CopySrc == PPC::CR6UN)
+ Subreg = PPC::sub_un;
+ // Loop backwards and return the first MI that modifies the physical CR Reg.
+ MachineBasicBlock::iterator Me = Copy, B = Copy->getParent()->begin();
+ while (Me != B)
+ if ((--Me)->modifiesRegister(CopySrc, TRI))
+ return &*Me;
+ return nullptr;
+ }
+ return MRI->getVRegDef(CopySrc);
+}
+
+void PPCReduceCRLogicals::initialize(MachineFunction &MFParam) {
+ MF = &MFParam;
+ MRI = &MF->getRegInfo();
+ TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+ AllCRLogicalOps.clear();
+}
+
+/// Contains all the implemented transformations on CR logical operations.
+/// For example, a binary CR logical can be used to split a block on its inputs,
+/// a unary CR logical might be used to change the condition code on a
+/// comparison feeding it. A nullary CR logical might simply be removable
+/// if the user of the bit it [un]sets can be transformed.
+bool PPCReduceCRLogicals::handleCROp(CRLogicalOpInfo &CRI) {
+ // We can definitely split a block on the inputs to a binary CR operation
+ // whose defs and (single) use are within the same block.
+ bool Changed = false;
+ if (CRI.IsBinary && CRI.ContainedInBlock && CRI.SingleUse && CRI.FeedsBR &&
+ CRI.DefsSingleUse) {
+ Changed = splitBlockOnBinaryCROp(CRI);
+ if (Changed)
+ NumBlocksSplitOnBinaryCROp++;
+ }
+ return Changed;
+}
+
+/// Splits a block that contains a CR-logical operation that feeds a branch
+/// and whose operands are produced within the block.
+/// Example:
+/// %vr5<def> = CMPDI %vr2, 0; CRRC:%vr5 G8RC:%vr2
+/// %vr6<def> = COPY %vr5:sub_eq; CRBITRC:%vr6 CRRC:%vr5
+/// %vr7<def> = CMPDI %vr3, 0; CRRC:%vr7 G8RC:%vr3
+/// %vr8<def> = COPY %vr7:sub_eq; CRBITRC:%vr8 CRRC:%vr7
+/// %vr9<def> = CROR %vr6<kill>, %vr8<kill>; CRBITRC:%vr9,%vr6,%vr8
+/// BC %vr9<kill>, <BB#2>; CRBITRC:%vr9
+/// Becomes:
+/// %vr5<def> = CMPDI %vr2, 0; CRRC:%vr5 G8RC:%vr2
+/// %vr6<def> = COPY %vr5:sub_eq; CRBITRC:%vr6 CRRC:%vr5
+/// BC %vr6<kill>, <BB#2>; CRBITRC:%vr6
+///
+/// %vr7<def> = CMPDI %vr3, 0; CRRC:%vr7 G8RC:%vr3
+/// %vr8<def> = COPY %vr7:sub_eq; CRBITRC:%vr8 CRRC:%vr7
+///  BC %vr8<kill>, <BB#2>; CRBITRC:%vr8
+bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
+ if (CRI.CopyDefs.first == CRI.CopyDefs.second) {
+ DEBUG(dbgs() << "Unable to split as the two operands are the same\n");
+ NumNotSplitIdenticalOperands++;
+ return false;
+ }
+ if (CRI.TrueDefs.first->isCopy() || CRI.TrueDefs.second->isCopy() ||
+ CRI.TrueDefs.first->isPHI() || CRI.TrueDefs.second->isPHI()) {
+ DEBUG(dbgs() << "Unable to split because one of the operands is a PHI or "
+ "chain of copies.\n");
+ NumNotSplitChainCopies++;
+ return false;
+ }
+ // Note: keep in sync with computeBranchTargetAndInversion().
+ if (CRI.MI->getOpcode() != PPC::CROR &&
+ CRI.MI->getOpcode() != PPC::CRAND &&
+ CRI.MI->getOpcode() != PPC::CRNOR &&
+ CRI.MI->getOpcode() != PPC::CRNAND &&
+ CRI.MI->getOpcode() != PPC::CRORC &&
+ CRI.MI->getOpcode() != PPC::CRANDC) {
+ DEBUG(dbgs() << "Unable to split blocks on this opcode.\n");
+ NumNotSplitWrongOpcode++;
+ return false;
+ }
+ DEBUG(dbgs() << "Splitting the following CR op:\n"; CRI.dump());
+ MachineBasicBlock::iterator Def1It = CRI.TrueDefs.first;
+ MachineBasicBlock::iterator Def2It = CRI.TrueDefs.second;
+
+ bool UsingDef1 = false;
+ MachineInstr *SplitBefore = &*Def2It;
+ for (auto E = CRI.MI->getParent()->end(); Def2It != E; ++Def2It) {
+ if (Def1It == Def2It) { // Def2 comes before Def1.
+ SplitBefore = &*Def1It;
+ UsingDef1 = true;
+ break;
+ }
+ }
+
+  DEBUG(dbgs() << "We will split the following block:\n");
+ DEBUG(CRI.MI->getParent()->dump());
+ DEBUG(dbgs() << "Before instruction:\n"; SplitBefore->dump());
+
+ // Get the branch instruction.
+ MachineInstr *Branch =
+ MRI->use_nodbg_begin(CRI.MI->getOperand(0).getReg())->getParent();
+
+ // We want the new block to have no code in it other than the definition
+ // of the input to the CR logical and the CR logical itself. So we move
+ // those to the bottom of the block (just before the branch). Then we
+ // will split before the CR logical.
+ MachineBasicBlock *MBB = SplitBefore->getParent();
+ auto FirstTerminator = MBB->getFirstTerminator();
+ MachineBasicBlock::iterator FirstInstrToMove =
+ UsingDef1 ? CRI.TrueDefs.first : CRI.TrueDefs.second;
+ MachineBasicBlock::iterator SecondInstrToMove =
+ UsingDef1 ? CRI.CopyDefs.first : CRI.CopyDefs.second;
+
+ // The instructions that need to be moved are not guaranteed to be
+ // contiguous. Move them individually.
+ // FIXME: If one of the operands is a chain of (single use) copies, they
+ // can all be moved and we can still split.
+ MBB->splice(FirstTerminator, MBB, FirstInstrToMove);
+ if (FirstInstrToMove != SecondInstrToMove)
+ MBB->splice(FirstTerminator, MBB, SecondInstrToMove);
+ MBB->splice(FirstTerminator, MBB, CRI.MI);
+
+ unsigned Opc = CRI.MI->getOpcode();
+ bool InvertOrigBranch, InvertNewBranch, TargetIsFallThrough;
+ computeBranchTargetAndInversion(Opc, Branch->getOpcode(), UsingDef1,
+ InvertNewBranch, InvertOrigBranch,
+ TargetIsFallThrough);
+ MachineInstr *SplitCond =
+ UsingDef1 ? CRI.CopyDefs.second : CRI.CopyDefs.first;
+ DEBUG(dbgs() << "We will " << (InvertNewBranch ? "invert" : "copy"));
+ DEBUG(dbgs() << " the original branch and the target is the " <<
+ (TargetIsFallThrough ? "fallthrough block\n" : "orig. target block\n"));
+ DEBUG(dbgs() << "Original branch instruction: "; Branch->dump());
+ BlockSplitInfo BSI { Branch, SplitBefore, SplitCond, InvertNewBranch,
+ InvertOrigBranch, TargetIsFallThrough, MBPI, CRI.MI,
+ UsingDef1 ? CRI.CopyDefs.first : CRI.CopyDefs.second };
+ bool Changed = splitMBB(BSI);
+ // If we've split on a CR logical that is fed by a CR logical,
+ // recompute the source CR logical as it may be usable for splitting.
+ if (Changed) {
+ bool Input1CRlogical =
+ CRI.TrueDefs.first && isCRLogical(*CRI.TrueDefs.first);
+ bool Input2CRlogical =
+ CRI.TrueDefs.second && isCRLogical(*CRI.TrueDefs.second);
+ if (Input1CRlogical)
+ AllCRLogicalOps.push_back(createCRLogicalOpInfo(*CRI.TrueDefs.first));
+ if (Input2CRlogical)
+ AllCRLogicalOps.push_back(createCRLogicalOpInfo(*CRI.TrueDefs.second));
+ }
+ return Changed;
+}
+
+void PPCReduceCRLogicals::collectCRLogicals() {
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ if (isCRLogical(MI)) {
+ AllCRLogicalOps.push_back(createCRLogicalOpInfo(MI));
+ TotalCRLogicals++;
+ if (AllCRLogicalOps.back().IsNullary)
+ TotalNullaryCRLogicals++;
+ else if (AllCRLogicalOps.back().IsBinary)
+ TotalBinaryCRLogicals++;
+ else
+ TotalUnaryCRLogicals++;
+ }
+ }
+ }
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(PPCReduceCRLogicals, DEBUG_TYPE,
+ "PowerPC Reduce CR logical Operation", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PPCReduceCRLogicals, DEBUG_TYPE,
+ "PowerPC Reduce CR logical Operation", false, false)
+
+char PPCReduceCRLogicals::ID = 0;
+FunctionPass*
+llvm::createPPCReduceCRLogicalsPass() { return new PPCReduceCRLogicals(); }
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 9207165c46a6d..6b62a82ef7bf9 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -21,12 +21,15 @@
#include "PPCTargetMachine.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -36,8 +39,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cstdlib>
@@ -49,6 +50,9 @@ using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "PPCGenRegisterInfo.inc"
+STATISTIC(InflateGPRC, "Number of gprc inputs for getLargestLegalClass");
+STATISTIC(InflateGP8RC, "Number of g8rc inputs for getLargestLegalClass");
+
static cl::opt<bool>
EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true),
cl::desc("Enable use of a base pointer for complex stack frames"));
@@ -57,6 +61,10 @@ static cl::opt<bool>
AlwaysBasePointer("ppc-always-use-base-pointer", cl::Hidden, cl::init(false),
cl::desc("Force the use of a base pointer in every function"));
+static cl::opt<bool>
+EnableGPRToVecSpills("ppc-enable-gpr-to-vsr-spills", cl::Hidden, cl::init(false),
+ cl::desc("Enable spills from gpr to vsr rather than stack"));
+
PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
: PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR,
TM.isPPC64() ? 0 : 1,
@@ -82,6 +90,8 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
// VSX
ImmToIdxMap[PPC::DFLOADf32] = PPC::LXSSPX;
ImmToIdxMap[PPC::DFLOADf64] = PPC::LXSDX;
+ ImmToIdxMap[PPC::SPILLTOVSR_LD] = PPC::SPILLTOVSR_LDX;
+ ImmToIdxMap[PPC::SPILLTOVSR_ST] = PPC::SPILLTOVSR_STX;
ImmToIdxMap[PPC::DFSTOREf32] = PPC::STXSSPX;
ImmToIdxMap[PPC::DFSTOREf64] = PPC::STXSDX;
ImmToIdxMap[PPC::LXV] = PPC::LXVX;
@@ -113,7 +123,7 @@ PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
const MCPhysReg*
PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
- if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) {
+ if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) {
if (Subtarget.hasVSX())
return CSR_64_AllRegs_VSX_SaveList;
if (Subtarget.hasAltivec())
@@ -151,7 +161,7 @@ PPCRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
return nullptr;
if (!TM.isPPC64())
return nullptr;
- if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS)
+ if (MF->getFunction().getCallingConv() != CallingConv::CXX_FAST_TLS)
return nullptr;
if (!MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
return nullptr;
@@ -328,6 +338,18 @@ PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
// With VSX, we can inflate various sub-register classes to the full VSX
// register set.
+ // For Power9 we allow the user to enable GPR to vector spills.
+ // FIXME: Currently limited to spilling GP8RC. A follow on patch will add
+ // support to spill GPRC.
+ if (TM.isELFv2ABI()) {
+ if (Subtarget.hasP9Vector() && EnableGPRToVecSpills &&
+ RC == &PPC::G8RCRegClass) {
+ InflateGP8RC++;
+ return &PPC::SPILLTOVSRRCRegClass;
+ }
+ if (RC == &PPC::GPRCRegClass && EnableGPRToVecSpills)
+ InflateGPRC++;
+ }
if (RC == &PPC::F8RCRegClass)
return &PPC::VSFRCRegClass;
else if (RC == &PPC::VRRCRegClass)
@@ -879,7 +901,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Naked functions have stack size 0, although getStackSize may not reflect
// that because we didn't call all the pieces that compute it for naked
// functions.
- if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) {
+ if (!MF.getFunction().hasFnAttribute(Attribute::Naked)) {
if (!(hasBasePointer(MF) && FrameIndex < 0))
Offset += MFI.getStackSize();
}
@@ -911,11 +933,16 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
SReg = MF.getRegInfo().createVirtualRegister(RC);
// Insert a set of rA with the full offset value before the ld, st, or add
- BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
- .addImm(Offset >> 16);
- BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg)
- .addReg(SRegHi, RegState::Kill)
- .addImm(Offset);
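+    // An offset that fits in a signed 16-bit immediate needs only a single
+    // LI/LI8; larger offsets still require the LIS/ORI pair.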
+ if (isInt<16>(Offset))
+ BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LI8 : PPC::LI), SReg)
+ .addImm(Offset);
+ else {
+ BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
+ .addImm(Offset >> 16);
+ BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg)
+ .addReg(SRegHi, RegState::Kill)
+ .addImm(Offset);
+ }
// Convert into indexed form of the instruction:
//
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index 896cec7e4f6e8..f7807907bd640 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -305,6 +305,11 @@ def VFRC : RegisterClass<"PPC", [f64], 64,
VF22, VF21, VF20)>;
def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>;
+// Allow spilling GPR's into caller-saved VSR's.
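+// The subtracted registers (VF20-VF31 and F14-F31) are the callee-saved
+// floating-point/vector registers, so only volatile VSRs are used as spill
+// destinations.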
+def SPILLTOVSRRC : RegisterClass<"PPC", [i64, f64], 64, (add G8RC, (sub VSFRC,
+ (sequence "VF%u", 31, 20),
+ (sequence "F%u", 31, 14)))>;
+
// Register class for single precision scalars in VSX registers
def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>;
diff --git a/lib/Target/PowerPC/PPCScheduleP9.td b/lib/Target/PowerPC/PPCScheduleP9.td
index a01995a629c29..b24f4fc603a15 100644
--- a/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/lib/Target/PowerPC/PPCScheduleP9.td
@@ -22,7 +22,9 @@ def P9Model : SchedMachineModel {
// Try to make sure we have at least 10 dispatch groups in a loop.
let LoopMicroOpBufferSize = 60;
- let CompleteModel = 0;
+ let CompleteModel = 1;
+
+ let UnsupportedFeatures = [HasQPX];
}
@@ -68,6 +70,10 @@ let SchedModel = P9Model in {
def LS : ProcResource<4>;
def PM : ProcResource<2>;
def DFU : ProcResource<1>;
+ def BR : ProcResource<1> {
+ let BufferSize = 16;
+ }
+ def CY : ProcResource<1>;
def TestGroup : ProcResGroup<[ALU, DP]>;
@@ -145,6 +151,10 @@ let SchedModel = P9Model in {
let Latency = 6;
}
+ def P9_DIV_12C : SchedWriteRes<[DIV]> {
+ let Latency = 12;
+ }
+
def P9_DIV_16C_8 : SchedWriteRes<[DIV]> {
let ResourceCycles = [8];
let Latency = 16;
@@ -190,6 +200,16 @@ let SchedModel = P9Model in {
let Latency = 24;
}
+ def P9_DPO_24C_8 : SchedWriteRes<[DPO]> {
+ let ResourceCycles = [8];
+ let Latency = 24;
+ }
+
+ def P9_DPE_24C_8 : SchedWriteRes<[DPE]> {
+ let ResourceCycles = [8];
+ let Latency = 24;
+ }
+
def P9_DP_26C_5 : SchedWriteRes<[DP]> {
let ResourceCycles = [5];
let Latency = 22;
@@ -205,6 +225,16 @@ let SchedModel = P9Model in {
let Latency = 33;
}
+ def P9_DPE_33C_8 : SchedWriteRes<[DPE]> {
+ let ResourceCycles = [8];
+ let Latency = 33;
+ }
+
+ def P9_DPO_33C_8 : SchedWriteRes<[DPO]> {
+ let ResourceCycles = [8];
+ let Latency = 33;
+ }
+
def P9_DP_36C_10 : SchedWriteRes<[DP]> {
let ResourceCycles = [10];
let Latency = 36;
@@ -248,31 +278,61 @@ let SchedModel = P9Model in {
let Latency = 76;
let ResourceCycles = [62];
}
+
+ def P9_BR_2C : SchedWriteRes<[BR]> {
+ let Latency = 2;
+ }
+
+ def P9_BR_5C : SchedWriteRes<[BR]> {
+ let Latency = 5;
+ }
+
+ def P9_CY_6C : SchedWriteRes<[CY]> {
+ let Latency = 6;
+ }
+
// ***************** WriteSeq Definitions *****************
def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>;
def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>;
def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>;
+ def P9_LoadAndLoadOp_8C : WriteSequence<[P9_LS_4C, P9_LS_4C]>;
def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>;
def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>;
def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>;
def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>;
+ def P9_DPOpAndALUOp_9C : WriteSequence<[P9_DP_7C, P9_ALU_2C]>;
+ def P9_DPOpAndALUOp_24C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_2C]>;
+ def P9_DPOpAndALUOp_35C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_2C]>;
// ***************** Defining Itinerary Class Resources *****************
+ // The following itineraries are fully covered by the InstRW definitions in
+  // P9InstrResources.td, so they are not listed here:
+ // IIC_FPDivD, IIC_FPDivS, IIC_FPFused, IIC_IntDivD, IIC_LdStLFDU,
+ // IIC_LdStLFDUX
+
def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_IntSimple, IIC_IntGeneral]>;
+ [IIC_IntSimple, IIC_IntGeneral, IIC_IntRFID,
+ IIC_IntRotateD, IIC_IntRotateDI, IIC_IntTrapD,
+ IIC_SprRFI]>;
+
+ def : ItinRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ [IIC_IntTrapW]>;
def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
[IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>;
def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntCompare]>;
+ def : ItinRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C], [IIC_VecGeneral, IIC_FPCompare]>;
+
def : ItinRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI]>;
+ [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI, IIC_IntMulHD]>;
def : ItinRW<[P9_LS_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_LdStLoad, IIC_LdStLD]>;
+ [IIC_LdStLoad, IIC_LdStLD, IIC_LdStLFD]>;
def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -300,12 +360,18 @@ let SchedModel = P9Model in {
def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C],
[IIC_LdStLWARX, IIC_LdStLDARX, IIC_LdStLMW]>;
+ def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ [IIC_LdStCOPY, IIC_SprABORT, IIC_LdStPASTE, IIC_LdStDCBF,
+ IIC_LdStICBI, IIC_LdStSync, IIC_SprISYNC, IIC_SprMSGSYNC,
+ IIC_SprSLBIA, IIC_SprSLBSYNC, IIC_SprTLBSYNC]>;
+
def : ItinRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
[IIC_LdStSTFD, IIC_LdStSTD, IIC_LdStStore]>;
def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStSTDU, IIC_LdStSTDUX]>;
+ [IIC_LdStSTDU, IIC_LdStSTDUX, IIC_LdStStoreUpd, IIC_SprSLBIEG,
+ IIC_SprTLBIA, IIC_SprTLBIE]>;
def : ItinRW<[P9_StoreAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -315,20 +381,44 @@ let SchedModel = P9Model in {
[IIC_BrCR, IIC_IntMTFSB0]>;
def : ItinRW<[P9_ALUOpAndALUOp_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
- IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C, DISP_1C, DISP_1C], [IIC_SprMFCR, IIC_SprMFCRF]>;
+ IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_SprMFCR, IIC_SprMFCRF, IIC_BrMCR, IIC_BrMCRX, IIC_IntMFFS]>;
+
+ def : ItinRW<[P9_BR_2C, DISP_1C], [IIC_BrB]>;
+ def : ItinRW<[P9_BR_5C, DISP_1C], [IIC_SprMFSPR]>;
// This class should be broken down to instruction level, once some missing
// info is obtained.
def : ItinRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C], [IIC_SprMTSPR]>;
- def : ItinRW<[P9_DP_7C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C], [IIC_FPGeneral, IIC_FPAddSub]>;
+ def : ItinRW<[P9_LoadAndLoadOp_8C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ [IIC_SprSLBIE, IIC_SprSLBMFEE, IIC_SprSLBMFEV, IIC_SprSLBMTE,
+ IIC_SprTLBIEL]>;
+
+ // IIC_VecFP is added here although many instructions with that itinerary
+ // use very different resources. It would appear that instructions were
+ // given that itinerary rather carelessly over time. Specific instructions
+ // that use different resources are listed in various InstrRW classes.
+ def : ItinRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_FPGeneral, IIC_FPAddSub, IIC_VecFP]>;
+
+ def : ItinRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C], [IIC_VecFPCompare]>;
+
+ def : ItinRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
+ [IIC_VecPerm]>;
def : ItinRW<[P9_DP_36C_10, IP_EXEC_1C], [IIC_FPSqrtD]>;
def : ItinRW<[P9_DP_26C_5, P9_DP_26C_5, IP_EXEC_1C, IP_EXEC_1C], [IIC_FPSqrtS]>;
+ def : ItinRW<[P9_DIV_12C, IP_EXECE_1C, DISP_1C, DISP_1C],
+ [IIC_SprMFMSR, IIC_SprMFPMR, IIC_SprMFSR, IIC_SprMFTB,
+ IIC_SprMTMSR, IIC_SprMTMSRD, IIC_SprMTPMR, IIC_SprMTSR]>;
+
+ def : ItinRW<[], [IIC_SprSTOP]>;
+
include "P9InstrResources.td"
}
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 90d11f46a384d..c351b5c04a056 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -19,9 +19,9 @@
#include "PPCInstrInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
#define GET_SUBTARGETINFO_HEADER
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 5f8085f4626e2..49f2699ab082e 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -25,7 +25,7 @@
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
#include "PPCTargetMachine.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/Debug.h"
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index fe092cc3b858d..491f25ca2c64a 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
@@ -31,7 +32,6 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include <cassert>
@@ -40,6 +40,10 @@
using namespace llvm;
+
+static cl::opt<bool>
+ EnableBranchCoalescing("enable-ppc-branch-coalesce", cl::Hidden,
+ cl::desc("enable coalescing of duplicate branches for PPC"));
static cl::
opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
cl::desc("Disable CTR loops for PPC"));
@@ -84,6 +88,10 @@ EnableMachineCombinerPass("ppc-machine-combiner",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ ReduceCRLogical("ppc-reduce-cr-logicals",
+ cl::desc("Expand eligible cr-logical binary ops to branches"),
+ cl::init(false), cl::Hidden);
extern "C" void LLVMInitializePowerPCTarget() {
// Register the targets
RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
@@ -93,7 +101,9 @@ extern "C" void LLVMInitializePowerPCTarget() {
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializePPCBoolRetToIntPass(PR);
initializePPCExpandISELPass(PR);
+ initializePPCPreEmitPeepholePass(PR);
initializePPCTLSDynamicCallPass(PR);
+ initializePPCMIPeepholePass(PR);
}
/// Return the datalayout string of a subtarget.
@@ -208,6 +218,17 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
return Reloc::Static;
}
+static CodeModel::Model getEffectiveCodeModel(const Triple &TT,
+ Optional<CodeModel::Model> CM,
+ bool JIT) {
+ if (CM)
+ return *CM;
+ if (!TT.isOSDarwin() && !JIT &&
+ (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le))
+ return CodeModel::Medium;
+ return CodeModel::Small;
+}
+
// The FeatureString here is a little subtle. We are modifying the feature
// string with what are (currently) non-function specific overrides as it goes
// into the LLVMTargetMachine constructor and then using the stored value in the
@@ -216,10 +237,12 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Optional<Reloc::Model> RM,
- CodeModel::Model CM, CodeGenOpt::Level OL)
+ Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT)
: LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
computeFSAdditions(FS, OL, TT), Options,
- getEffectiveRelocModel(TT, RM), CM, OL),
+ getEffectiveRelocModel(TT, RM),
+ getEffectiveCodeModel(TT, CM, JIT), OL),
TLOF(createTLOF(getTargetTriple())),
TargetABI(computeTargetABI(TT, Options)) {
initAsmInfo();
@@ -365,12 +388,19 @@ bool PPCPassConfig::addInstSelector() {
}
void PPCPassConfig::addMachineSSAOptimization() {
+ // PPCBranchCoalescingPass need to be done before machine sinking
+ // since it merges empty blocks.
+ if (EnableBranchCoalescing && getOptLevel() != CodeGenOpt::None)
+ addPass(createPPCBranchCoalescingPass());
TargetPassConfig::addMachineSSAOptimization();
// For little endian, remove where possible the vector swap instructions
// introduced at code generation to normalize vector element order.
if (TM->getTargetTriple().getArch() == Triple::ppc64le &&
!DisableVSXSwapRemoval)
addPass(createPPCVSXSwapRemovalPass());
+ // Reduce the number of cr-logical ops.
+ if (ReduceCRLogical && getOptLevel() != CodeGenOpt::None)
+ addPass(createPPCReduceCRLogicalsPass());
// Target-specific peephole cleanups performed after instruction
// selection.
if (!DisableMIPeephole) {
@@ -412,6 +442,7 @@ void PPCPassConfig::addPreSched2() {
}
void PPCPassConfig::addPreEmitPass() {
+ addPass(createPPCPreEmitPeepholePass());
addPass(createPPCExpandISELPass());
if (getOptLevel() != CodeGenOpt::None)
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index be705507b5347..102bf7ca59c26 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -35,14 +35,15 @@ private:
public:
PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT);
~PPCTargetMachine() override;
const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
- // The no argument getSubtargetImpl, while it exists on some targets, is
- // deprecated and should not be used.
+ // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget,
+ // subtargets are per-function entities based on the target-specific
+ // attributes of each function.
const PPCSubtarget *getSubtargetImpl() const = delete;
// Pass Pipeline Configuration
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.h b/lib/Target/PowerPC/PPCTargetObjectFile.h
index c8b9b2e9790b6..8343a90696d92 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.h
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -10,8 +10,8 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 6110706b01b90..aa4073f7ea025 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -10,10 +10,10 @@
#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/CostTable.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/CostTable.h"
-#include "llvm/Target/TargetLowering.h"
using namespace llvm;
#define DEBUG_TYPE "ppctti"
@@ -189,6 +189,17 @@ int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
return PPCTTIImpl::getIntImmCost(Imm, Ty);
}
+unsigned PPCTTIImpl::getUserCost(const User *U,
+ ArrayRef<const Value *> Operands) {
+ if (U->getType()->isVectorTy()) {
+ // Instructions that need to be split should cost more.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
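+    // LT.first is (roughly) the number of legal-width pieces the type is
+    // split into, so wider-than-legal vectors get a proportionally higher
+    // cost.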
+ return LT.first * BaseT::getUserCost(U, Operands);
+ }
+
+ return BaseT::getUserCost(U, Operands);
+}
+
void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
if (ST->getDarwinDirective() == PPC::DIR_A2) {
@@ -215,9 +226,17 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
return LoopHasReductions;
}
-bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
- MaxLoadSize = 8;
- return true;
+const PPCTTIImpl::TTI::MemCmpExpansionOptions *
+PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
+ static const auto Options = []() {
+ TTI::MemCmpExpansionOptions Options;
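+    // Load sizes are listed from largest to smallest so the expansion can
+    // use the widest load that fits each chunk of the comparison.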
+ Options.LoadSizes.push_back(8);
+ Options.LoadSizes.push_back(4);
+ Options.LoadSizes.push_back(2);
+ Options.LoadSizes.push_back(1);
+ return Options;
+ }();
+ return &Options;
}
bool PPCTTIImpl::enableInterleavedAccessVectorization() {
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 99ca6394d1bed..b42dae4a0254c 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -21,7 +21,7 @@
#include "PPCTargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
@@ -51,6 +51,8 @@ public:
int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty);
+ unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
@@ -61,7 +63,8 @@ public:
/// @{
bool enableAggressiveInterleaving(bool LoopHasReductions);
- bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
+ const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
+ bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector) const;
diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index a57484e5abdf7..f15af790de8f5 100644
--- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -20,7 +20,7 @@
#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -90,21 +90,21 @@ protected:
// This pass is run after register coalescing, and so we're looking for
// a situation like this:
// ...
- // %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
- // %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
- // %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+ // %5 = COPY %9; VSLRC:%5,%9
+ // %5<def,tied1> = XSMADDADP %5<tied0>, %17, %16,
+ // implicit %rm; VSLRC:%5,%17,%16
// ...
- // %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
- // %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
+ // %9<def,tied1> = XSMADDADP %9<tied0>, %17, %19,
+ // implicit %rm; VSLRC:%9,%17,%19
// ...
// Where we can eliminate the copy by changing from the A-type to the
// M-type instruction. Specifically, for this example, this means:
- // %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
- // %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+ // %5<def,tied1> = XSMADDADP %5<tied0>, %17, %16,
+ // implicit %rm; VSLRC:%5,%17,%16
// is replaced by:
- // %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
- // %RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
- // and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+ // %16<def,tied1> = XSMADDMDP %16<tied0>, %18, %9,
+ // implicit %rm; VSLRC:%16,%18,%9
+ // and we remove: %5 = COPY %9; VSLRC:%5,%9
SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
@@ -150,13 +150,13 @@ protected:
// walking the MIs we may as well test liveness here.
//
// FIXME: There is a case that occurs in practice, like this:
- // %vreg9<def> = COPY %F1; VSSRC:%vreg9
+ // %9 = COPY %f1; VSSRC:%9
// ...
- // %vreg6<def> = COPY %vreg9; VSSRC:%vreg6,%vreg9
- // %vreg7<def> = COPY %vreg9; VSSRC:%vreg7,%vreg9
- // %vreg9<def,tied1> = XSMADDASP %vreg9<tied0>, %vreg1, %vreg4; VSSRC:
- // %vreg6<def,tied1> = XSMADDASP %vreg6<tied0>, %vreg1, %vreg2; VSSRC:
- // %vreg7<def,tied1> = XSMADDASP %vreg7<tied0>, %vreg1, %vreg3; VSSRC:
+ // %6 = COPY %9; VSSRC:%6,%9
+ // %7 = COPY %9; VSSRC:%7,%9
+ // %9<def,tied1> = XSMADDASP %9<tied0>, %1, %4; VSSRC:
+ // %6<def,tied1> = XSMADDASP %6<tied0>, %1, %2; VSSRC:
+ // %7<def,tied1> = XSMADDASP %7<tied0>, %1, %3; VSSRC:
// which prevents an otherwise-profitable transformation.
bool OtherUsers = false, KillsAddendSrc = false;
for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
@@ -177,11 +177,11 @@ protected:
// The transformation doesn't work well with things like:
- // %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
- // unless vreg11 is also a kill, so skip when it is not,
+ // %5 = A-form-op %5, %11, %5;
+ // unless %11 is also a kill, so skip when it is not,
// and check operand 3 to see it is also a kill to handle the case:
- // %vreg5 = A-form-op %vreg5, %vreg5, %vreg11;
- // where vreg5 and vreg11 are both kills. This case would be skipped
+ // %5 = A-form-op %5, %5, %11;
+ // where %5 and %11 are both kills. This case would be skipped
// otherwise.
unsigned OldFMAReg = MI.getOperand(0).getReg();
@@ -343,7 +343,7 @@ protected:
public:
bool runOnMachineFunction(MachineFunction &MF) override {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
// If we don't have VSX then go ahead and return without doing
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 7d34efd4af3e0..8a5fb9fdaef11 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -191,7 +191,7 @@ private:
public:
// Main entry point for this pass.
bool runOnMachineFunction(MachineFunction &MF) override {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
// If we don't have VSX on the subtarget, don't do anything.
@@ -353,6 +353,8 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
break;
case PPC::LXSDX:
case PPC::LXSSPX:
+ case PPC::XFLOADf64:
+ case PPC::XFLOADf32:
// A load of a floating-point value into the high-order half of
// a vector register is safe, provided that we introduce a swap
// following the load, which will be done by the SUBREG_TO_REG
@@ -964,7 +966,7 @@ LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {
dbgs() << format("%6d", ID);
dbgs() << format("%6d", EC->getLeaderValue(ID));
- dbgs() << format(" BB#%3d", MI->getParent()->getNumber());
+ dbgs() << format(" %bb.%3d", MI->getParent()->getNumber());
dbgs() << format(" %14s ", TII->getName(MI->getOpcode()).str().c_str());
if (SwapVector[EntryIdx].IsLoad)
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
index bc09d5f8a7e8e..b4bf635dc2c75 100644
--- a/lib/Target/PowerPC/README.txt
+++ b/lib/Target/PowerPC/README.txt
@@ -256,7 +256,7 @@ _clamp0g:
cmpwi cr0, r3, 0
li r2, 0
blt cr0, LBB1_2
-; BB#1: ; %entry
+; %bb.1: ; %entry
mr r2, r3
LBB1_2: ; %entry
mr r3, r2
diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt
index f70ebd82bd5c9..c38e019231611 100644
--- a/lib/Target/PowerPC/README_ALTIVEC.txt
+++ b/lib/Target/PowerPC/README_ALTIVEC.txt
@@ -233,7 +233,7 @@ declare <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8>, <16 x i8>) #1
Produces the following code with -mtriple=powerpc64-unknown-linux-gnu:
-# BB#0: # %entry
+# %bb.0: # %entry
addis 3, 2, .LCPI0_0@toc@ha
addis 4, 2, .LCPI0_1@toc@ha
addi 3, 3, .LCPI0_0@toc@l
diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index a637dd11f8105..979595264472f 100644
--- a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -27,11 +27,11 @@ Target &llvm::getThePPC64LETarget() {
extern "C" void LLVMInitializePowerPCTargetInfo() {
RegisterTarget<Triple::ppc, /*HasJIT=*/true> X(getThePPC32Target(), "ppc32",
- "PowerPC 32");
+ "PowerPC 32", "PPC");
RegisterTarget<Triple::ppc64, /*HasJIT=*/true> Y(getThePPC64Target(), "ppc64",
- "PowerPC 64");
+ "PowerPC 64", "PPC");
RegisterTarget<Triple::ppc64le, /*HasJIT=*/true> Z(
- getThePPC64LETarget(), "ppc64le", "PowerPC 64 LE");
+ getThePPC64LETarget(), "ppc64le", "PowerPC 64 LE", "PPC");
}
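
[Editor's note, not part of the patch] The hunk above adds a fourth argument to each RegisterTarget call: a backend name ("PPC") shared by all triples handled by the same backend. As a minimal sketch assuming that four-argument RegisterTarget constructor, a hypothetical out-of-tree target would register itself the same way; "Foo" and getTheFooTarget() are made-up names for illustration only.

#include "llvm/ADT/Triple.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;

// Hypothetical target singleton, mirroring getThePPC32Target() and friends.
static Target &getTheFooTarget() {
  static Target TheFooTarget;
  return TheFooTarget;
}

extern "C" void LLVMInitializeFooTargetInfo() {
  // Short name, human-readable description, and the (new) backend name.
  RegisterTarget<Triple::UnknownArch, /*HasJIT=*/false> X(
      getTheFooTarget(), "foo", "Hypothetical Foo target", "Foo");
}
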
diff --git a/lib/Target/PowerPC/p9-instrs.txt b/lib/Target/PowerPC/p9-instrs.txt
deleted file mode 100644
index a70582aca3989..0000000000000
--- a/lib/Target/PowerPC/p9-instrs.txt
+++ /dev/null
@@ -1,442 +0,0 @@
-Content:
-========
-. Remaining Instructions (Total 56 Instructions, including 2 unknown instructions)
-. Done (Total 155 Instructions: 101 VSX, 54 Altivec)
-
-//------------------------------------------------------------------------------
-//. Remaining Instructions
-//------------------------------------------------------------------------------
-GCC reference: https://sourceware.org/ml/binutils/2015-11/msg00071.html
-
-// Add PC Immediate Shifted DX-form p69
-[PO RT d1 d0 XO d2] addpcis RT,D
- subpcis Rx,value = addpcis Rx,-value
-
-// 6.17.2 Decimal Integer Format Conversion Instructions
-
-// Decimal Convert From National VX-form p352
-[PO VRT EO VRB 1 PS XO] bcdcfn. VRT,VRB,PS
-
-// Decimal Convert From Zoned VX-form p353
-[PO VRT EO VRB 1 PS XO] bcdcfz. VRT,VRB,PS
-
-// Decimal Convert To National VX-form p354
-[PO VRT EO VRB 1 / XO] bcdctn. VRT,VRB
-
-// Decimal Convert To Zoned VX-form p355
-[PO VRT EO VRB 1 PS XO] bcdctz. VRT,VRB,PS
-
-// Decimal Convert From Signed Quadword VX-form p356
-[PO VRT EO VRB 1 PS XO] bcdcfsq. VRT,VRB,PS
-
-// Decimal Convert To Signed Quadword VX-form p356
-[PO VRT EO VRB 1 / XO] bcdctsq. VRT,VRB
-
-// 6.17.3 Decimal Integer Sign Manipulation Instructions
-
-// Decimal Copy Sign VX-form p358
-[PO VRT VRA VRB XO] bcdcpsgn. VRT,VRA,VRB
-
-// Decimal Set Sign VX-form p358
-[PO VRT EO VRB 1 PS XO] bcdsetsgn. VRT,VRB,PS
-
-// Decimal Shift VX-form p359
-[PO VRT VRA VRB 1 PS XO] bcds. VRT,VRA,VRB,PS
-
-// Decimal Unsigned Shift VX-form p360
-[PO VRT VRA VRB 1 / XO] bcdus. VRT,VRA,VRB
-
-// Decimal Shift and Round VX-form p361
-[PO VRT VRA VRB 1 PS XO] bcdsr. VRT,VRA,VRB,PS
-
-// 6.17.5 Decimal Integer Truncate Instructions
-
-// Decimal Truncate VX-form p362
-[PO VRT VRA VRB 1 PS XO] bcdtrunc. VRT,VRA,VRB,PS
-
-// Decimal Unsigned Truncate VX-form p363
-[PO VRT VRA VRB 1 / XO] bcdutrunc. VRT,VRA,VRB
-
-// 3.3.10.1 Character-Type Compare Instructions
-
-// Compare Ranged Byte X-form p87
-[PO BF / L RA RB XO /] cmprb BF,L,RA,RB
-
-// Compare Equal Byte X-form p88
-[PO BF // RA RB XO /] cmpeqb BF,RA,RB
-
-// 3.3.13 Fixed-Point Logical Instructions
-
-// Count Trailing Zeros Word X-form p95
-[PO RS RA /// XO Rc] cnttzw(.) RA,RS
-
-// 3.3.13.1 64-bit Fixed-Point Logical Instructions
-
-// Count Trailing Zeros Doubleword X-form p98
-[PO RS RA /// XO Rc] cnttzd(.) RA,RS
-
-// 4.4 Copy-Paste Facility
-
-// Copy X-form p858
-[PO /// L RA RB XO /] copy RA,RB,L
- copy_first = copy RA, RB, 1
-// CP_Abort p860
-[PO /// /// /// XO /] cp_abort
-
-// Paste p859
-[PO /// L RA RB XO Rc] paste(.) RA,RB,L
- paste_last = paste RA,RB,1
-
-// 3.3.9 Fixed-Point Arithmetic Instructions
-
-// Deliver A Random Number X-form p79
-[PO RT /// L /// XO /] darn RT,L
-
-// Multiply-Add High Doubleword VA-form p81
-[PO RT RA RB RC XO] maddhd RT,RA,RB,RC
-
-// Multiply-Add High Doubleword Unsigned VA-form p81
-[PO RT RA RB RC XO] maddhdu RT,RA,RB,RC
-
-// Multiply-Add Low Doubleword VA-form p81
-[PO RT RA RB RC XO] maddld RT,RA,RB,RC
-
-// Modulo Signed Word X-form p76
-[PO RT RA RB XO /] modsw RT,RA,RB
-
-// Modulo Unsigned Word X-form p76
-[PO RT RA RB XO /] moduw RT,RA,RB
-
-// Modulo Signed Doubleword X-form p84
-[PO RT RA RB XO /] modsd RT,RA,RB
-
-// Modulo Unsigned Doubleword X-form p84
-[PO RT RA RB XO /] modud RT,RA,RB
-
-
-// DFP Test Significance Immediate [Quad] X-form p204
-[PO BF / UIM FRB XO /] dtstsfi BF,UIM,FRB
-[PO BF / UIM FRBp XO /] dtstsfiq BF,UIM,FRBp
-
-// 3.3.14.2.1 64-bit Fixed-Point Shift Instructions
-
-// Extend-Sign Word and Shift Left Immediate XS-form p109
-[PO RS RA sh XO sh Rc] extswsli(.) RA,RS,SH
-
-// 4.5.1 Load Atomic
-
-// Load Word Atomic X-form p864
-[PO RT RA FC XO /] lwat RT,RA,FC
-
-// Load Doubleword Atomic X-form p864
-[PO RT RA FC XO /] ldat RT,RA,FC
-
-// 4.5.2 Store Atomic
-
-// Store Word Atomic X-form p866
-[PO RS RA FC XO /] stwat RS,RA,FC
-
-// Store Doubleword Atomic X-form p866
-[PO RS RA FC XO /] stdat RS,RA,FC
-
-// 3.3.2.1 64-bit Fixed-Point Load Instructions
-
-// Load Doubleword Monitored Indexed X-form p54
-[PO RT RA RB XO /] ldmx RT,RA,RB
-
-// 3.3.16 Move To/From Vector-Scalar Register Instructions
-
-// Move From VSR Lower Doubleword XX1-form p111
-[PO S RA /// XO SX] mfvsrld RA,XS
-
-// Move To VSR Double Doubleword XX1-form p114
-[PO T RA RB XO TX] mtvsrdd XT,RA,RB
-
-// Move To VSR Word & Splat XX1-form p115
-[PO T RA /// XO TX] mtvsrws XT,RA
-
-// Move to CR from XER Extended X-form p119
-[PO BF // /// /// XO /] mcrxrx BF
-
-// Set Boolean X-form p121
-[PO RT BFA // /// XO /] setb RT,BFA
-
-// Message Synchronize X-form p1126
-[PO /// /// /// XO /] msgsync
-
-// SLB Invalidate Entry Global X-form p1026
-[PO RS /// RB XO /] slbieg RS,RB
-
-// SLB Synchronize X-form p1031
-[PO /// /// /// XO /] slbsync
-
-// 3.3.2.1 Power-Saving Mode Instruction
-
-// stop XL-form p957
-[PO /// /// /// XO /] stop
-
-// 4.6.4 Wait Instruction
-// Wait X-form p880
-[PO /// WC /// /// XO /] wait
-
-// Unknown Instructions:
-urfid
-- gcc's implementation:
- {"urfid", XL(19,306), 0xffffffff, POWER9, PPCNONE, {0}},
- (4c 00 02 64|64 02 00 4c) urfid
-
-rmieg
-- gcc's implementation:
- {"rmieg", X(31,882), XRTRA_MASK, POWER9, PPCNONE, {RB}},
- (7c 00 f6 e4|e4 f6 00 7c) rmieg r30
-
-//------------------------------------------------------------------------------
-//. Done:
-//------------------------------------------------------------------------------
-
-//======================================
-"vsx instructions"
-
-//--------------------------------------
-"7.6.1.2.1 VSX Scalar Move Instructions"
-// VSX Scalar Quad-Precision Move Instructions
-
-// VSX Scalar Copy Sign Quad-Precision X-form p.553
-[PO VRT VRA VRB XO /] xscpsgnqp
-
-// VSX Scalar Absolute Quad-Precision X-form 531
-// VSX Scalar Negate Quad-Precision X-form 627
-// VSX Scalar Negative Absolute Quad-Precision X-form 626
-[PO VRT XO VRB XO /] xsabsqp xsnegqp xsnabsqp
-
-//--------------------------------------
-"7.6.1.3 VSX Floating-Point Arithmetic Instructions"
-
-// VSX Scalar Quad-Precision Elementary Arithmetic
-
-// VSX Scalar Add Quad-Precision [using round to Odd] X-form 539
-// VSX Scalar Divide Quad-Precision [using round to Odd] X-form 584
-// VSX Scalar Multiply Quad-Precision [using round to Odd] X-form 622
-[PO VRT VRA VRB XO RO] xsaddqp xsaddqpo xsdivqp xsdivqpo xsmulqp xsmulqpo
-
-// VSX Scalar Square Root Quad-Precision [using round to Odd] X-form 662
-// VSX Scalar Subtract Quad-Precision [using round to Odd] X-form 667
- xssubqp xssubqpo
-
-[PO VRT XO VRB XO RO] xssqrtqp xssqrtqpo
-
-// VSX Scalar Quad-Precision Multiply-Add Arithmetic Instructions
-
-// VSX Scalar Multiply-Add Quad-Precision [using round to Odd] X-form 596
-// VSX Scalar Multiply-Subtract Quad-Precision [using round to Odd] X-form 617
-// VSX Scalar Negative Multiply-Add Quad-Precision [using round to Odd] X-form 636
-// VSX Scalar Negative Multiply-Subtract Quad-Precision [using round to Odd]
-// X-form 645
-[PO VRT VRA VRB XO RO] xsmaddqp xsmaddqpo xsmsubqp xsmsubqpo
- xsnmaddqp xsnmaddqpo xsnmsubqp xsnmsubqpo
-
-22
-//--------------------------------------
-"7.6.1.4 VSX Floating-Point Compare Instructions"
-
-// VSX Scalar Quad-Precision Compare Instructions
-
-// VSX Scalar Compare Ordered Quad-Precision X-form 549
-// VSX Scalar Compare Unordered Quad-Precision X-form 552
-[PO BF // VRA VRB XO /] xscmpoqp xscmpuqp
-
-"7.6.1.8 VSX Scalar Floating-Point Support Instructions"
-// VSX Scalar Compare Exponents Quad-Precision X-form p. 541 542
-[PO BF // A B XO AX BX /] xscmpexpdp
-[PO BF // VRA VRB XO /] xscmpexpqp
-
-// VSX Scalar Compare DP, XX3-form, p.543 544 545
-// VSX Scalar Compare Equal Double-Precision,
-[PO T A B XO AX BX TX] xscmpeqdp xscmpgedp xscmpgtdp xscmpnedp
-
-// VSX Vector Compare Not Equal Double-Precision XX3-form 691
-[PO T A B Rc XO AX BX TX] xvcmpnedp xvcmpnedp. xvcmpnesp xvcmpnesp.
-
-//--------------------------------------
-"7.6.1.5 VSX FP-FP Conversion Instructions"
-// VSX Scalar Quad-Precision Floating-Point Conversion Instructions
-
-// VSX Scalar round & Convert Quad-Precision format to Double-Precision format
-// [using round to Odd] X-form 567
-[PO VRT XO VRB XO /] xscvqpdp xscvqpdpo (actually [PO VRT XO VRB XO RO])
-[PO VRT XO VRB XO /] xscvdpqp
-
-// VSX Scalar Quad-Precision Convert to Integer Instructions
-
-// VSX Scalar truncate & Convert Quad-Precision format to Signed Doubleword format
-// 568 570 572 574
-[PO VRT XO VRB XO /] xscvqpsdz xscvqpswz xscvqpudz xscvqpuwz
-576 = 580 xscvsdqp xscvudqp
-
-"7.6.1.7 VSX Round to Floating-Point Integer Instructions"
-// VSX Scalar round & Convert Double-Precision format to Half-Precision format
-// XX2-form 554 566
-[PO T XO B XO BX TX] xscvdphp xscvhpdp
-
-// VSX Vector Convert Half-Precision format to Single-Precision format
-// XX2-form 703 705
-[PO T XO B XO BX TX] xvcvhpsp xvcvsphp
-
-// VSX Scalar Round to Quad-Precision Integer [with Inexact] Z23-form 654
-[PO VRT /// R VRB RMC XO EX] xsrqpi xsrqpix
-
-// VSX Scalar Round Quad-Precision to Double-Extended Precision Z23-form 656
-[PO VRT /// R VRB RMC XO /] xsrqpxp
-def XSRQPXP : Z23Form_1<63, 37,
- (outs vrrc:$vT), (ins u5imm:$R, vrrc:$vB, u2imm:$RMC),
- "xsrqpxp $vT, $R, $vB, $RMC"), IIC_VecFP, []>;
-
-27~28
-//--------------------------------------
-// VSX Scalar Insert Exponent Double-Precision X-form 588
-// VSX Scalar Insert Exponent Quad-Precision X-form 589
-[PO VT rA rB XO /] xsiexpdp
-[PO VRT VRA VRB XO /] xsiexpqp
-
-// VSX Vector Insert Exponent Double-Precision XX3-form 722
-[PO T A B XO AX BX TX] xviexpdp xviexpsp
-
-// VSX Vector Extract Unsigned Word XX2-form 788
-// VSX Vector Insert Word XX2-form
-[PO T / UIM B XO BX TX] xxextractuw xxinsertw
-
-// VSX Scalar Extract Exponent Double-Precision XX2-form 676
-[PO BF DCMX B XO BX /]
-[PO T XO B XO BX /] xsxexpdp xsxsigdp
-// X-form
-[PO VRT XO VRB XO /] xsxexpqp xsxsigqp
-
-// VSX Vector Extract Exponent Double-Precision XX2-form 784
-[PO T XO B XO BX TX] xvxexpdp xvxexpsp
-
-// VSX Vector Extract Significand Double-Precision XX2-form 785
-[PO T XO B XO BX TX] xvxsigdp xvxsigsp
-
-//--------------------------------------
-// VSX Scalar Test Data Class Double-Precision XX2-form p673
-// VSX Scalar Test Data Class Quad-Precision X-form 674
-// VSX Scalar Test Data Class Single-Precision XX2-form 675
-[PO BF DCMX B XO BX /] xststdcdp xststdcsp
-[PO BF DCMX VRB XO /] xststdcqp
-
-// VSX Vector Test Data Class Double-Precision XX2-form 782 783
-[PO T dx B XO dc XO dm BX TX] xvtstdcdp xvtstdcsp
-
-//--------------------------------------
-// VSX Scalar Maximum Type-C Double-Precision XX3-form 601 ~ 609
-[PO T A B XO AX BX TX] xsmaxcdp xsmaxjdp xsmincdp xsminjdp
-
-//--------------------------------------
-// VSX Vector Byte-Reverse Doubleword XX2-form 786 787
-[PO T XO B XO BX TX] xxbrd xxbrh xxbrq xxbrw
-
-// VSX Vector Permute XX3-form 794
-[PO T A B XO AX BX TX] xxperm xxpermr
-
-// VSX Vector Splat Immediate Byte 796 x-form
-[PO T EO IMM8 XO TX] xxspltib <= sign or unsigned?
-
-30
-//--------------------------------------
-// Load VSX Vector DQ-form 511
-[PO T RA DQ TX XO] lxv
-
-// Store VSX Vector DQ-form 526
-[PO S RA DQ SX XO] stxv
-
-// Load VSX Scalar Doubleword DS-form 499
-// Load VSX Scalar Single DS-form 504
-[PO VRT RA DS XO] lxsd lxssp
-
-// Store VSX Scalar Doubleword DS-form 517
-// Store VSX Scalar Single DS-form 520
-[PO VRT RA DS XO] stxsd stxssp
-
-
-// Load VSX Vector Indexed X-form 511
-// Load VSX Scalar as Integer Byte & Zero Indexed X-form 501
-// Load VSX Vector Byte*16 Indexed X-form 506
-// Load VSX Vector with Length X-form 508
-// Load VSX Vector Left-justified with Length X-form 510
-// Load VSX Vector Halfword*8 Indexed X-form 514
-// Load VSX Vector Word & Splat Indexed X-form 516
-[PO T RA RB XO TX] lxvx lxsibzx lxsihzx lxvb16x lxvl lxvll lxvh8x lxvwsx
-
-// Store VSX Scalar as Integer Byte Indexed X-form 518
-// Store VSX Scalar as Integer Halfword Indexed X-form 518
-// Store VSX Vector Byte*16 Indexed X-form 522
-// Store VSX Vector Halfword*8 Indexed X-form 524
-// Store VSX Vector with Length X-form 526
-// Store VSX Vector Left-justified with Length X-form 528
-// Store VSX Vector Indexed X-form 529
-[PO S RA RB XO SX] stxsibx stxsihx stxvb16x stxvh8x stxvl stxvll stxvx
-
-21
-
-//--------------------------------------
-". vector instructions"
-
-[1] PowerISA-v3.0 p.933 - Table 1, and Chapter 6. Vector Facility (altivec)
-[2] https://sourceware.org/ml/binutils/2015-11/msg00071.html
-
-//--------------------------------------
-New patch:
-// vector bit, p.367, 6.16 Vector Bit Permute Instruction
-[PO VRT VRA VRB XO] vbpermd, (existing: vbpermq)
-
-// vector permute, p.280
-[PO VRT VRA VRB VRC XO] vpermr
-
-// vector rotate left, p.341
-[PO VRT VRA VRB XO] vrlwnm vrlwmi vrldnm vrldmi
-
-// vector shift, p.285
-[PO VRT VRA VRB XO] vslv vsrv
-
-// vector multiply-by-10, p.375
-[PO VRT VRA /// XO] vmul10cuq vmul10uq
-[PO VRT VRA VRB XO] vmul10ecuq vmul10euq
-
-12
-//--------------------------------------
-http://reviews.llvm.org/D15887 + ext + neg + prty - vbpermd
-// vector count leading/trailing zero
-. new vx-form: p.31, 1.6.14 VX-FORM
-[PO RT EO VRB XO] vclzlsbb vctzlsbb (p.363)
-
-// Vector Count Trailing Zeros Instructions, 362
-[PO VRT EO VRB XO] vctzb vctzh vctzw vctzd (v16i8 v8i16 v4i32 v2i64)
-
-// vector extend sign (p.314)
-[PO VRT EO VRB XO] vextsb2w vextsh2w vextsb2d vextsh2d vextsw2d
-
-// vector negate, p.313
-[PO VRT EO VRB XO] vnegd vnegw
-
-// vector parity, p.335
-[PO VRT EO VRB XO] vprtybd vprtybq vprtybw
-
-16
-//--------------------------------------
-// vector compare, p.330
-[PO VRT VRA VRB RC XO] vcmpneb vcmpneb. vcmpneh vcmpneh. vcmpnew vcmpnew.
- vcmpnezb vcmpnezb. vcmpnezh vcmpnezh. vcmpnezw vcmpnezw.
-12
-//--------------------------------------
-http://reviews.llvm.org/D15917 + insert
-// vector extract (p.287) ref: vspltb (v2.07, p.227)
-// vector insert, p.288
-[PO VRT / UIM VRB XO] vinsertb vinsertd vinserth vinsertw
-
-// Vector Extract Unsigned
-[PO VRT / UIM VRB XO] vextractub vextractuh vextractuw vextractd
-
-// p.364: Vector Extract Unsigned Left/Right-Indexed
-[PO RT RA VRB XO] vextublx vextubrx vextuhlx vextuhrx vextuwlx vextuwrx
-
-14