Diffstat (limited to 'lib/Target')
135 files changed, 6453 insertions, 1052 deletions
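The most mechanical change running through the AArch64 call-lowering files below is that ISD::CALLSEQ_START now takes two immediate operands instead of one; the SelectionDAG, FastISel and GlobalISel paths plus the ADJCALLSTACKDOWN pseudo are updated together. As a quick orientation, the new call sites look like the following excerpts (taken from this diff, not a standalone example; AArch64 always passes 0 for the added immediate):

    // SelectionDAG lowering (AArch64ISelLowering.cpp): the helper now takes
    // the outgoing-argument byte count and the extra immediate directly.
    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);

    // FastISel (AArch64FastISel.cpp): the call-frame-setup pseudo now gets a
    // second immediate operand as well.
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
        .addImm(NumBytes)
        .addImm(0);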
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 73f2b6a25f66..4af5fef4287c 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -216,6 +216,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", FeatureCRC, FeatureCrypto, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon ]>; diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp index ff3e4c40e2c2..29f6d571d6bd 100644 --- a/lib/Target/AArch64/AArch64CallLowering.cpp +++ b/lib/Target/AArch64/AArch64CallLowering.cpp @@ -380,7 +380,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets); } - CallSeqStart.addImm(Handler.StackSize); + CallSeqStart.addImm(Handler.StackSize).addImm(0); MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP) .addImm(Handler.StackSize) .addImm(0); diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 083708001757..9ac7ecb9cdb4 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -3014,7 +3014,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Process the args. for (CCValAssign &VA : ArgLocs) { diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4b1bb27dce73..4f7c2e122390 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2265,7 +2265,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + StructType *RetTy = StructType::get(ArgTy, ArgTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) @@ -3249,9 +3249,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, - true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index cb268828455e..c42738da7ab0 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3427,6 +3427,10 @@ static bool getFMAPatterns(MachineInstr &Root, Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); Found = true; } + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1); + Found = true; + } break; case AArch64::FSUBDrr: if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { @@ -3441,6 +3445,10 @@ static bool getFMAPatterns(MachineInstr &Root, Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); Found = true; } + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1); + Found = true; + } break; case AArch64::FSUBv2f32: if (canCombineWithFMUL(MBB, Root.getOperand(2), @@ -3495,6 +3503,8 @@ AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { case MachineCombinerPattern::FMULADDD_OP2: case MachineCombinerPattern::FMULSUBD_OP1: case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FNMULSUBS_OP1: + case MachineCombinerPattern::FNMULSUBD_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP2: case MachineCombinerPattern::FMLAv1i64_indexed_OP1: @@ -3996,6 +4006,24 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; } + + case MachineCombinerPattern::FNMULSUBS_OP1: + case MachineCombinerPattern::FNMULSUBD_OP1: { + // FNMUL I=A,B,0 + // FSUB R,I,C + // ==> FNMADD R,A,B,C // = -A*B - C + // --- Create(FNMADD); + if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) { + Opc = AArch64::FNMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FNMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + } + case MachineCombinerPattern::FMULSUBS_OP2: case MachineCombinerPattern::FMULSUBD_OP2: { // FMUL I=A,B,0 @@ -4011,6 +4039,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; + } case MachineCombinerPattern::FMLSv1i32_indexed_OP2: Opc = AArch64::FMLSv1i32_indexed; @@ -4067,7 +4096,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( FMAInstKind::Accumulator); } break; - } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 902b08844216..5ddf66654a67 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -156,7 +156,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", - SDCallSeqStart<[ SDTCisVT<0, i32> ]>, + 
SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>, [SDNPHasChain, SDNPOutGlue]>; def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", SDCallSeqEnd<[ SDTCisVT<0, i32>, @@ -328,8 +329,9 @@ include "AArch64InstrFormats.td" let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { // We set Sched to empty list because we expect these instructions to simply get // removed in most cases. -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(AArch64callseq_start timm:$amt)]>, Sched<[]>; +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_start timm:$amt1, timm:$amt2)]>, + Sched<[]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, Sched<[]>; diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 5f895903da6f..789270c2a34b 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -529,9 +529,34 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // for the greedy mode the cost of the cross bank copy will // offset this number. // FIXME: Should be derived from the scheduling model. - if (OpRegBankIdx[0] >= PMI_FirstFPR) + if (OpRegBankIdx[0] != PMI_FirstGPR) Cost = 2; + else + // Check if that load feeds fp instructions. + // In that case, we want the default mapping to be on FPR + // instead of blind map every scalar to GPR. + for (const MachineInstr &UseMI : + MRI.use_instructions(MI.getOperand(0).getReg())) + // If we have at least one direct use in a FP instruction, + // assume this was a floating point load in the IR. + // If it was not, we would have had a bitcast before + // reaching that instruction. + if (isPreISelGenericFloatingPointOpcode(UseMI.getOpcode())) { + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } break; + case TargetOpcode::G_STORE: + // Check if that store is fed by fp instructions. + if (OpRegBankIdx[0] == PMI_FirstGPR) { + unsigned VReg = MI.getOperand(0).getReg(); + if (!VReg) + break; + MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (isPreISelGenericFloatingPointOpcode(DefMI->getOpcode())) + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } } // Finally construct the computed mapping. 
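The FNMULSUBS_OP1/FNMULSUBD_OP1 combiner patterns added to AArch64InstrInfo.cpp above fold an FNMUL feeding an FSUB into a single FNMADD, using the identity -(a*b) - c. A minimal standalone sketch of that algebra (plain C++ with illustrative values, not LLVM code; the fused form differs from the two-instruction sequence only by one fewer rounding step):

    #include <cmath>
    #include <cstdio>

    int main() {
      double a = 1.5, b = -2.25, c = 0.75;
      double fnmul = -(a * b);            // FNMUL: negated multiply
      double unfused = fnmul - c;         // FSUB of the FNMUL result: -(a*b) - c
      double fused = std::fma(-a, b, -c); // what a single FNMADD computes
      std::printf("unfused = %g, fused = %g\n", unfused, fused);
      return 0;
    }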
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 8f8eeef8a6cf..a9b4d44a523e 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -42,11 +42,11 @@ def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1 def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FMULX16, FMULX32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FMULX16, FMULX32)>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FMULX64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FMULX64)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>; @@ -62,9 +62,9 @@ def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4 def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>; -def : InstRW<[FalkorWr_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>; def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; @@ -72,13 +72,14 @@ def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i1 def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; + +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v1i64_indexed$")>; -def : InstRW<[FalkorWr_2VXVY_5cyc, FalkorReadFMA],(instregex 
"^FML(A|S)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_2VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v2i64_indexed$")>; // SIMD Integer Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>; @@ -119,10 +120,10 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64) def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQRDML(A|S)?H(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>; @@ -169,9 +170,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>; @@ -185,8 +186,9 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>; def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)(i16|i32)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)v.*$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)v[248].*$")>; + // SIMD Load Instructions // ----------------------------------------------------------------------------- def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; @@ -294,9 +296,9 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, 
FRSQRTSv2f32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>; def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; @@ -311,9 +313,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>; -def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>; -def : InstRW<[FalkorWr_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>; @@ -416,22 +418,25 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>; def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>; def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], (instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], (instregex "^F(N)?M(ADD|SUB)Drrr$")>; // FP Miscellaneous Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; +def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(H|S|D)i$")>; +def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hi|Hr|S0|Si|Sr|D0|Di|Dr|v.*_ns)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hr|Sr|Dr|v.*_ns)$")>; +// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov 0.0 +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>; def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>; @@ -475,16 +480,17 @@ def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>; // Divide and Multiply Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1X_4cyc], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; -def : InstRW<[FalkorWr_1X_4cyc], (instregex "^M(ADD|SUB)Wrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, 
ReadDefault, FalkorReadIMA64], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; +def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], (instregex "^M(ADD|SUB)Wrrr$")>; -def : InstRW<[FalkorWr_1X_5cyc], (instregex "^(S|U)MULHrr$")>; -def : InstRW<[FalkorWr_1X_5cyc], (instregex "^M(ADD|SUB)Xrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^M(ADD|SUB)Xrrr$")>; def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)(MLAL|MLSL|MULL)v.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(S|U)MULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^(S|U)(MLAL|MLSL)v.*$")>; // Move and Shift Instructions // ----------------------------------------------------------------------------- diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td index e64b2c441a19..6526cc28e806 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td +++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td @@ -29,8 +29,9 @@ // Define 1 micro-op types def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } -def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } -def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } +def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } @@ -45,8 +46,10 @@ def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } -def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } +def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } @@ -75,14 +78,26 @@ def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 4; let NumMicroOps = 2; } +def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 5; let NumMicroOps = 2; } +def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 6; let NumMicroOps = 2; } +def FalkorWr_FMUL64_2VXVY_6cyc : 
SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { let Latency = 4; @@ -350,18 +365,17 @@ def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, let NumMicroOps = 9; } -// Forwarding logic is modeled for vector multiply and accumulate +// Forwarding logic is modeled for multiply add/accumulate. // ----------------------------------------------------------------------------- -def FalkorReadVMA : SchedReadAdvance<2, [FalkorWr_1VXVY_4cyc, - FalkorWr_2VXVY_4cyc]>; -def FalkorReadFMA : SchedReadAdvance<3, [FalkorWr_1VXVY_5cyc, - FalkorWr_1VXVY_6cyc, - FalkorWr_2VXVY_5cyc, - FalkorWr_2VXVY_6cyc]>; +def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; +def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; +def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>; +def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; +def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; // SchedPredicates and WriteVariants for Immediate Zero and LSLFast // ----------------------------------------------------------------------------- -def FalkorImmZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>; +def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>; def FalkorWr_FMOV : SchedWriteVariant<[ @@ -378,7 +392,6 @@ def FalkorWr_LDR : SchedWriteVariant<[ def FalkorWr_ADD : SchedWriteVariant<[ SchedVar<FalkorLSLFastPred, [FalkorWr_1XYZ_1cyc]>, - SchedVar<FalkorImmZPred, [FalkorWr_1XYZ_1cyc]>, SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>; def FalkorWr_PRFM : SchedWriteVariant<[ diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index abdeac019a18..1c81d34014fd 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -91,6 +91,8 @@ void AArch64Subtarget::initializeProperties() { case Falkor: MaxInterleaveFactor = 4; VectorInsertExtractBaseCost = 2; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case Kryo: MaxInterleaveFactor = 4; @@ -99,6 +101,8 @@ void AArch64Subtarget::initializeProperties() { PrefetchDistance = 740; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 11; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case ThunderX2T99: CacheLineSize = 64; @@ -108,6 +112,8 @@ void AArch64Subtarget::initializeProperties() { PrefetchDistance = 128; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 4; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case ThunderX: case ThunderXT88: @@ -116,6 +122,8 @@ void AArch64Subtarget::initializeProperties() { CacheLineSize = 128; PrefFunctionAlignment = 3; PrefLoopAlignment = 2; + // FIXME: remove this to enable 64-bit SLP if performance looks good. 
+ MinVectorRegisterBitWidth = 128; break; case CortexA35: break; case CortexA53: break; diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 5b9bee6e41b8..df54bf3f48e1 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -83,6 +83,9 @@ protected: // NegativeImmediates - transform instructions with negative immediates bool NegativeImmediates = true; + // Enable 64-bit vectorization in SLP. + unsigned MinVectorRegisterBitWidth = 64; + bool UseAA = false; bool PredictableSelectIsExpensive = false; bool BalanceFPOps = false; @@ -106,6 +109,7 @@ protected: unsigned PrefFunctionAlignment = 0; unsigned PrefLoopAlignment = 0; unsigned MaxJumpTableSize = 0; + unsigned WideningBaseCost = 0; // ReserveX18 - X18 is not available as a general purpose register. bool ReserveX18; @@ -190,6 +194,10 @@ public: bool isXRaySupported() const override { return true; } + unsigned getMinVectorRegisterBitWidth() const { + return MinVectorRegisterBitWidth; + } + bool isX18Reserved() const { return ReserveX18; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } @@ -228,6 +236,8 @@ public: unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } + unsigned getWideningBaseCost() const { return WideningBaseCost; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 8875f9b72647..12a2e9a867f0 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -70,3 +70,11 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); return MCBinaryExpr::createSub(Res, PC, getContext()); } + +void AArch64_MachoTargetObjectFile::getNameWithPrefix( + SmallVectorImpl<char> &OutName, const GlobalValue *GV, + const TargetMachine &TM) const { + // AArch64 does not use section-relative relocations so any global symbol must + // be accessed via at least a linker-private symbol. + getMangler().getNameWithPrefix(OutName, GV, /* CannotUsePrivateLabel */ true); +} diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 05e1dfa9e6c9..47e3bce43f6e 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -40,6 +40,9 @@ public: const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; + + void getNameWithPrefix(SmallVectorImpl<char> &OutName, const GlobalValue *GV, + const TargetMachine &TM) const override; }; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 4d59da0c646d..7c6f55c06bce 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -176,11 +176,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } +bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, + ArrayRef<const Value *> Args) { + + // A helper that returns a vector type from the given type. The number of + // elements in type Ty determine the vector width. 
+ auto toVectorTy = [&](Type *ArgTy) { + return VectorType::get(ArgTy->getScalarType(), + DstTy->getVectorNumElements()); + }; + + // Exit early if DstTy is not a vector type whose elements are at least + // 16-bits wide. + if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) + return false; + + // Determine if the operation has a widening variant. We consider both the + // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the + // instructions. + // + // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we + // verify that their extending operands are eliminated during code + // generation. + switch (Opcode) { + case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). + case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + break; + default: + return false; + } + + // To be a widening instruction (either the "wide" or "long" versions), the + // second operand must be a sign- or zero extend having a single user. We + // only consider extends having a single user because they may otherwise not + // be eliminated. + if (Args.size() != 2 || + (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) || + !Args[1]->hasOneUse()) + return false; + auto *Extend = cast<CastInst>(Args[1]); + + // Legalize the destination type and ensure it can be used in a widening + // operation. + auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); + unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); + if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) + return false; + + // Legalize the source type and ensure it can be used in a widening + // operation. + Type *SrcTy = toVectorTy(Extend->getSrcTy()); + auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); + unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); + if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) + return false; + + // Get the total number of vector elements in the legalized types. + unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements(); + unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements(); + + // Return true if the legalized types have the same number of vector elements + // and the destination element type size is twice that of the source type. + return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; +} + int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + // If the cast is observable, and it is used by a widening instruction (e.g., + // uaddl, saddw, etc.), it may be free. + if (I && I->hasOneUse()) { + auto *SingleUser = cast<Instruction>(*I->user_begin()); + SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); + if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { + // If the cast is the second operand, it is free. We will generate either + // a "wide" or "long" version of the widening instruction. + if (I == SingleUser->getOperand(1)) + return 0; + // If the cast is not the second operand, it will be free if it looks the + // same as the second operand. In this case, we will generate a "long" + // version of the widening instruction. 
+ if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) + if (I->getOpcode() == Cast->getOpcode() && + cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) + return 0; + } + } + EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); @@ -379,6 +463,16 @@ int AArch64TTIImpl::getArithmeticInstrCost( // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), + // add in the widening overhead specified by the sub-target. Since the + // extends feeding widening instructions are performed automatically, they + // aren't present in the generated code and have a zero cost. By adding a + // widening overhead here, we attach the total cost of the combined operation + // to the widening instruction. + int Cost = 0; + if (isWideningInstruction(Ty, Opcode, Args)) + Cost += ST->getWideningBaseCost(); + int ISD = TLI->InstructionOpcodeToISD(Opcode); if (ISD == ISD::SDIV && @@ -388,9 +482,9 @@ int AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. - int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -405,8 +499,8 @@ int AArch64TTIImpl::getArithmeticInstrCost( switch (ISD) { default: - return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); case ISD::ADD: case ISD::MUL: case ISD::XOR: @@ -414,7 +508,7 @@ int AArch64TTIImpl::getArithmeticInstrCost( case ISD::AND: // These nodes are marked as 'custom' for combining purposes only. // We know that they are legal. See LowerAdd in ISelLowering. 
- return 1 * LT.first; + return (Cost + 1) * LT.first; } } diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index e37c003e064c..280d97f3c502 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -43,6 +43,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; + bool isWideningInstruction(Type *Ty, unsigned Opcode, + ArrayRef<const Value *> Args); + public: explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), @@ -84,6 +87,10 @@ public: return 64; } + unsigned getMinVectorRegisterBitWidth() { + return ST->getMinVectorRegisterBitWidth(); + } + unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, @@ -134,6 +141,10 @@ public: unsigned getMinPrefetchStride(); unsigned getMaxPrefetchIterationsAhead(); + + bool shouldExpandReduction(const IntrinsicInst *II) const { + return false; + } /// @} }; diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 4dbcc9581a84..449d732a8d44 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3904,10 +3904,14 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { return false; } +static SMLoc incrementLoc(SMLoc L, int Offset) { + return SMLoc::getFromPointer(L.getPointer() + Offset); +} + /// parseDirectiveCPU /// ::= .cpu id bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { - SMLoc CPULoc = getLoc(); + SMLoc CurLoc = getLoc(); StringRef CPU, ExtensionString; std::tie(CPU, ExtensionString) = @@ -3923,15 +3927,19 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { // FIXME This is using tablegen data, but should be moved to ARMTargetParser // once that is tablegen'ed if (!getSTI().isCPUStringValid(CPU)) { - Error(CPULoc, "unknown CPU name"); + Error(CurLoc, "unknown CPU name"); return false; } MCSubtargetInfo &STI = copySTI(); STI.setDefaultFeatures(CPU, ""); + CurLoc = incrementLoc(CurLoc, CPU.size()); FeatureBitset Features = STI.getFeatureBits(); for (auto Name : RequestedExtensions) { + // Advance source location past '+'. 
+ CurLoc = incrementLoc(CurLoc, 1); + bool EnableFeature = true; if (Name.startswith_lower("no")) { @@ -3939,6 +3947,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { Name = Name.substr(2); } + bool FoundExtension = false; for (const auto &Extension : ExtensionMap) { if (Extension.Name != Name) continue; @@ -3952,9 +3961,15 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { uint64_t Features = ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); setAvailableFeatures(Features); + FoundExtension = true; break; } + + if (!FoundExtension) + Error(CurLoc, "unsupported architectural extension"); + + CurLoc = incrementLoc(CurLoc, Name.size()); } return false; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 94112849f84e..1b28df963b40 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -32,8 +32,9 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant( clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"))); AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant; + // We prefer NEON instructions to be printed in the short, Apple-specific + // form when targeting Darwin. + AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant; PrivateGlobalPrefix = "L"; PrivateLabelPrefix = "L"; @@ -68,8 +69,9 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { if (T.getArch() == Triple::aarch64_be) IsLittleEndian = false; - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; + // We prefer NEON instructions to be printed in the generic form when + // targeting ELF. + AssemblerDialect = AsmWriterVariant == Default ? 
Generic : AsmWriterVariant; CodePointerSize = 8; diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 8f6e1e7d8846..3f89702bed50 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -50,6 +50,10 @@ FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); +FunctionPass *createAMDGPUMachineCFGStructurizerPass(); + +void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&); +extern char &AMDGPUMachineCFGStructurizerID; ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 2e5b78bbf7ef..b279bd61e180 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -61,6 +61,24 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "Support flat address space" >; +def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets", + "FlatInstOffsets", + "true", + "Flat instructions have immediate offset addressing mode" +>; + +def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts", + "FlatGlobalInsts", + "true", + "Have global_* flat memory instructions" +>; + +def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", + "FlatScratchInsts", + "true", + "Have scratch_* flat memory instructions" +>; + def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", "UnalignedBufferAccess", "true", @@ -407,7 +425,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureDPP + FeatureFastFMAF32, FeatureDPP, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ccae36ced1f8..7c99752b881f 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -136,8 +136,7 @@ private: bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset, SDValue &ImmOffset, SDValue &VOffset) const; - bool SelectFlat(SDValue Addr, SDValue &VAddr, - SDValue &SLC, SDValue &TFE) const; + bool SelectFlat(SDValue Addr, SDValue &VAddr, SDValue &SLC) const; bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; @@ -1278,10 +1277,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr, SDValue &VAddr, - SDValue &SLC, - SDValue &TFE) const { + SDValue &SLC) const { VAddr = Addr; - TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); + SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 915d1d9e0e68..f80652b87373 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -567,13 +567,19 @@ static bool hasSourceMods(const SDNode *N) { case AMDGPUISD::INTERP_P1: case AMDGPUISD::INTERP_P2: case AMDGPUISD::DIV_SCALE: + + // TODO: Should really be looking at the users of the bitcast. 
These are + // problematic because bitcasts are used to legalize all stores to integer + // types. + case ISD::BITCAST: return false; default: return true; } } -static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) { +bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, + unsigned CostThreshold) { // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus // it is truly free to use a source modifier in all cases. If there are // multiple users but for each one will necessitate using VOP3, there will be @@ -2299,7 +2305,7 @@ static bool isU24(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); DAG.computeKnownBits(Op, Known); - return (VT.getSizeInBits() - Known.Zero.countLeadingOnes()) <= 24; + return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24; } static bool isI24(SDValue Op, SelectionDAG &DAG) { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index e1a5a2072418..4c588a7bafd0 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -132,6 +132,8 @@ public: return false; } + static bool allUsesHaveSourceMods(const SDNode *N, + unsigned CostThreshold = 4); bool isFAbsFree(EVT VT) const override; bool isFNegFree(EVT VT) const override; bool isTruncateFree(EVT Src, EVT Dest) const override; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 8867ed689a31..a7eac080f885 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -127,9 +127,9 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { .add(I.getOperand(1)) .add(I.getOperand(0)) .addImm(0) - .addImm(0) .addImm(0); + // Now that we selected an opcode, we need to constrain the register // operands to use appropriate classes. bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); @@ -393,7 +393,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { .add(I.getOperand(0)) .addReg(PtrReg) .addImm(0) - .addImm(0) .addImm(0); bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index a2567a549028..9de302994e68 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -33,6 +33,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { const LLT P1 = LLT::pointer(1, 64); const LLT P2 = LLT::pointer(2, 64); + setAction({G_CONSTANT, S32}, Legal); setAction({G_CONSTANT, S64}, Legal); setAction({G_GEP, P1}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp new file mode 100644 index 000000000000..6d2785ba1c60 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -0,0 +1,2881 @@ +//===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the machine instruction level CFG structurizer pass. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegionInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <tuple> +using namespace llvm; + +#define DEBUG_TYPE "amdgpucfgstructurizer" + +namespace { +class PHILinearizeDestIterator; + +class PHILinearize { + friend class PHILinearizeDestIterator; + +public: + typedef std::pair<unsigned, MachineBasicBlock *> PHISourceT; + +private: + typedef DenseSet<PHISourceT> PHISourcesT; + typedef struct { + unsigned DestReg; + DebugLoc DL; + PHISourcesT Sources; + } PHIInfoElementT; + typedef SmallPtrSet<PHIInfoElementT *, 2> PHIInfoT; + PHIInfoT PHIInfo; + + static unsigned phiInfoElementGetDest(PHIInfoElementT *Info); + static void phiInfoElementSetDef(PHIInfoElementT *Info, unsigned NewDef); + static PHISourcesT &phiInfoElementGetSources(PHIInfoElementT *Info); + static void phiInfoElementAddSource(PHIInfoElementT *Info, unsigned SourceReg, + MachineBasicBlock *SourceMBB); + static void phiInfoElementRemoveSource(PHIInfoElementT *Info, + unsigned SourceReg, + MachineBasicBlock *SourceMBB); + PHIInfoElementT *findPHIInfoElement(unsigned DestReg); + PHIInfoElementT *findPHIInfoElementFromSource(unsigned SourceReg, + MachineBasicBlock *SourceMBB); + +public: + bool findSourcesFromMBB(MachineBasicBlock *SourceMBB, + SmallVector<unsigned, 4> &Sources); + void addDest(unsigned DestReg, const DebugLoc &DL); + void replaceDef(unsigned OldDestReg, unsigned NewDestReg); + void deleteDef(unsigned DestReg); + void addSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB); + void removeSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB = nullptr); + bool findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB, + unsigned &DestReg); + bool isSource(unsigned Reg, MachineBasicBlock *SourceMBB = nullptr); + unsigned getNumSources(unsigned DestReg); + void dump(MachineRegisterInfo *MRI); + void clear(); + + typedef PHISourcesT::iterator source_iterator; + typedef PHILinearizeDestIterator dest_iterator; + + dest_iterator dests_begin(); + dest_iterator dests_end(); + + source_iterator sources_begin(unsigned Reg); + source_iterator sources_end(unsigned Reg); +}; + +class PHILinearizeDestIterator { +private: + PHILinearize::PHIInfoT::iterator Iter; + +public: + unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); } + PHILinearizeDestIterator &operator++() { + ++Iter; + return *this; + } + bool operator==(const PHILinearizeDestIterator &I) const { + return I.Iter == Iter; + } + bool operator!=(const PHILinearizeDestIterator &I) const { + return I.Iter != Iter; + } + + PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {} +}; + +unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) { + return Info->DestReg; +} + 
+void PHILinearize::phiInfoElementSetDef(PHIInfoElementT *Info, + unsigned NewDef) { + Info->DestReg = NewDef; +} + +PHILinearize::PHISourcesT & +PHILinearize::phiInfoElementGetSources(PHIInfoElementT *Info) { + return Info->Sources; +} + +void PHILinearize::phiInfoElementAddSource(PHIInfoElementT *Info, + unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + // Assertion ensures we don't use the same SourceMBB for the + // sources, because we cannot have different registers with + // identical predecessors, but we can have the same register for + // multiple predecessors. +#if !defined(NDEBUG) + for (auto SI : phiInfoElementGetSources(Info)) { + assert((SI.second != SourceMBB || SourceReg == SI.first)); + } +#endif + + phiInfoElementGetSources(Info).insert(PHISourceT(SourceReg, SourceMBB)); +} + +void PHILinearize::phiInfoElementRemoveSource(PHIInfoElementT *Info, + unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + auto &Sources = phiInfoElementGetSources(Info); + SmallVector<PHISourceT, 4> ElimiatedSources; + for (auto SI : Sources) { + if (SI.first == SourceReg && + (SI.second == nullptr || SI.second == SourceMBB)) { + ElimiatedSources.push_back(PHISourceT(SI.first, SI.second)); + } + } + + for (auto &Source : ElimiatedSources) { + Sources.erase(Source); + } +} + +PHILinearize::PHIInfoElementT * +PHILinearize::findPHIInfoElement(unsigned DestReg) { + for (auto I : PHIInfo) { + if (phiInfoElementGetDest(I) == DestReg) { + return I; + } + } + return nullptr; +} + +PHILinearize::PHIInfoElementT * +PHILinearize::findPHIInfoElementFromSource(unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + for (auto I : PHIInfo) { + for (auto SI : phiInfoElementGetSources(I)) { + if (SI.first == SourceReg && + (SI.second == nullptr || SI.second == SourceMBB)) { + return I; + } + } + } + return nullptr; +} + +bool PHILinearize::findSourcesFromMBB(MachineBasicBlock *SourceMBB, + SmallVector<unsigned, 4> &Sources) { + bool FoundSource = false; + for (auto I : PHIInfo) { + for (auto SI : phiInfoElementGetSources(I)) { + if (SI.second == SourceMBB) { + FoundSource = true; + Sources.push_back(SI.first); + } + } + } + return FoundSource; +} + +void PHILinearize::addDest(unsigned DestReg, const DebugLoc &DL) { + assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exsists"); + PHISourcesT EmptySet; + PHIInfoElementT *NewElement = new PHIInfoElementT(); + NewElement->DestReg = DestReg; + NewElement->DL = DL; + NewElement->Sources = EmptySet; + PHIInfo.insert(NewElement); +} + +void PHILinearize::replaceDef(unsigned OldDestReg, unsigned NewDestReg) { + phiInfoElementSetDef(findPHIInfoElement(OldDestReg), NewDestReg); +} + +void PHILinearize::deleteDef(unsigned DestReg) { + PHIInfoElementT *InfoElement = findPHIInfoElement(DestReg); + PHIInfo.erase(InfoElement); + delete InfoElement; +} + +void PHILinearize::addSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + phiInfoElementAddSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB); +} + +void PHILinearize::removeSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + phiInfoElementRemoveSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB); +} + +bool PHILinearize::findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB, + unsigned &DestReg) { + PHIInfoElementT *InfoElement = + findPHIInfoElementFromSource(SourceReg, SourceMBB); + if (InfoElement != nullptr) { + DestReg = phiInfoElementGetDest(InfoElement); + return true; + } + return false; +} + +bool 
PHILinearize::isSource(unsigned Reg, MachineBasicBlock *SourceMBB) { + unsigned DestReg; + return findDest(Reg, SourceMBB, DestReg); +} + +unsigned PHILinearize::getNumSources(unsigned DestReg) { + return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size(); +} + +void PHILinearize::dump(MachineRegisterInfo *MRI) { + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + dbgs() << "=PHIInfo Start=\n"; + for (auto PII : this->PHIInfo) { + PHIInfoElementT &Element = *PII; + dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI) + << " Sources: {"; + for (auto &SI : Element.Sources) { + dbgs() << PrintReg(SI.first, TRI) << "(BB#" + << SI.second->getNumber() << "),"; + } + dbgs() << "}\n"; + } + dbgs() << "=PHIInfo End=\n"; +} + +void PHILinearize::clear() { PHIInfo = PHIInfoT(); } + +PHILinearize::dest_iterator PHILinearize::dests_begin() { + return PHILinearizeDestIterator(PHIInfo.begin()); +} + +PHILinearize::dest_iterator PHILinearize::dests_end() { + return PHILinearizeDestIterator(PHIInfo.end()); +} + +PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) { + auto InfoElement = findPHIInfoElement(Reg); + return phiInfoElementGetSources(InfoElement).begin(); +} +PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) { + auto InfoElement = findPHIInfoElement(Reg); + return phiInfoElementGetSources(InfoElement).end(); +} + +class RegionMRT; +class MBBMRT; + +static unsigned getPHINumInputs(MachineInstr &PHI) { + assert(PHI.isPHI()); + return (PHI.getNumOperands() - 1) / 2; +} + +static MachineBasicBlock *getPHIPred(MachineInstr &PHI, unsigned Index) { + assert(PHI.isPHI()); + return PHI.getOperand(Index * 2 + 2).getMBB(); +} + +static void setPhiPred(MachineInstr &PHI, unsigned Index, + MachineBasicBlock *NewPred) { + PHI.getOperand(Index * 2 + 2).setMBB(NewPred); +} + +static unsigned getPHISourceReg(MachineInstr &PHI, unsigned Index) { + assert(PHI.isPHI()); + return PHI.getOperand(Index * 2 + 1).getReg(); +} + +static unsigned getPHIDestReg(MachineInstr &PHI) { + assert(PHI.isPHI()); + return PHI.getOperand(0).getReg(); +} + +class LinearizedRegion { +protected: + MachineBasicBlock *Entry; + // The exit block is part of the region, and is the last + // merge block before exiting the region. 
+ MachineBasicBlock *Exit; + DenseSet<unsigned> LiveOuts; + SmallPtrSet<MachineBasicBlock *, 1> MBBs; + bool HasLoop; + LinearizedRegion *Parent; + RegionMRT *RMRT; + + void storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, + MachineInstr *DefInstr, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + void storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, + MachineInstr *DefInstr, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo); + + void storeMBBLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo, + RegionMRT *TopRegion); + + void storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + void storeLiveOuts(RegionMRT *Region, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo, + RegionMRT *TopRegion = nullptr); + +public: + void setRegionMRT(RegionMRT *Region) { RMRT = Region; } + + RegionMRT *getRegionMRT() { return RMRT; } + + void setParent(LinearizedRegion *P) { Parent = P; } + + LinearizedRegion *getParent() { return Parent; } + + void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr); + + void setBBSelectRegIn(unsigned Reg); + + unsigned getBBSelectRegIn(); + + void setBBSelectRegOut(unsigned Reg, bool IsLiveOut); + + unsigned getBBSelectRegOut(); + + void setHasLoop(bool Value); + + bool getHasLoop(); + + void addLiveOut(unsigned VReg); + + void removeLiveOut(unsigned Reg); + + void replaceLiveOut(unsigned OldReg, unsigned NewReg); + + void replaceRegister(unsigned Register, unsigned NewRegister, + MachineRegisterInfo *MRI, bool ReplaceInside, + bool ReplaceOutside, bool IncludeLoopPHIs); + + void replaceRegisterInsideRegion(unsigned Register, unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI); + + void replaceRegisterOutsideRegion(unsigned Register, unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI); + + DenseSet<unsigned> *getLiveOuts(); + + void setEntry(MachineBasicBlock *NewEntry); + + MachineBasicBlock *getEntry(); + + void setExit(MachineBasicBlock *NewExit); + + MachineBasicBlock *getExit(); + + void addMBB(MachineBasicBlock *MBB); + + void addMBBs(LinearizedRegion *InnerRegion); + + bool contains(MachineBasicBlock *MBB); + + bool isLiveOut(unsigned Reg); + + bool hasNoDef(unsigned Reg, MachineRegisterInfo *MRI); + + void removeFalseRegisterKills(MachineRegisterInfo *MRI); + + void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + LinearizedRegion(); + + ~LinearizedRegion(); +}; + +class MRT { +protected: + RegionMRT *Parent; + unsigned BBSelectRegIn; + unsigned BBSelectRegOut; + +public: + unsigned getBBSelectRegIn() { return BBSelectRegIn; } + + unsigned getBBSelectRegOut() { return BBSelectRegOut; } + + void setBBSelectRegIn(unsigned Reg) { BBSelectRegIn = Reg; } + + void setBBSelectRegOut(unsigned Reg) { BBSelectRegOut = Reg; } + + virtual RegionMRT *getRegionMRT() { return nullptr; } + + virtual MBBMRT *getMBBMRT() { return nullptr; } + + bool isRegion() { return getRegionMRT() != nullptr; } + + bool isMBB() { return getMBBMRT() != nullptr; } + + bool isRoot() { return Parent == nullptr; } + + void setParent(RegionMRT *Region) { Parent = 
Region; } + + RegionMRT *getParent() { return Parent; } + + static MachineBasicBlock * + initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo, + DenseMap<MachineRegion *, RegionMRT *> &RegionMap); + + static RegionMRT *buildMRT(MachineFunction &MF, + const MachineRegionInfo *RegionInfo, + const SIInstrInfo *TII, + MachineRegisterInfo *MRI); + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) = 0; + + void dumpDepth(int depth) { + for (int i = depth; i > 0; --i) { + dbgs() << " "; + } + } + + virtual ~MRT() {} +}; + +class MBBMRT : public MRT { + MachineBasicBlock *MBB; + +public: + virtual MBBMRT *getMBBMRT() { return this; } + + MachineBasicBlock *getMBB() { return MBB; } + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) { + dumpDepth(depth); + dbgs() << "MBB: " << getMBB()->getNumber(); + dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI); + dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n"; + } + + MBBMRT(MachineBasicBlock *BB) : MBB(BB) { + setParent(nullptr); + setBBSelectRegOut(0); + setBBSelectRegIn(0); + } +}; + +class RegionMRT : public MRT { +protected: + MachineRegion *Region; + LinearizedRegion *LRegion; + MachineBasicBlock *Succ; + + SetVector<MRT *> Children; + +public: + virtual RegionMRT *getRegionMRT() { return this; } + + void setLinearizedRegion(LinearizedRegion *LinearizeRegion) { + LRegion = LinearizeRegion; + } + + LinearizedRegion *getLinearizedRegion() { return LRegion; } + + MachineRegion *getMachineRegion() { return Region; } + + unsigned getInnerOutputRegister() { + return (*(Children.begin()))->getBBSelectRegOut(); + } + + void addChild(MRT *Tree) { Children.insert(Tree); } + + SetVector<MRT *> *getChildren() { return &Children; } + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) { + dumpDepth(depth); + dbgs() << "Region: " << (void *)Region; + dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI); + dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n"; + + dumpDepth(depth); + if (getSucc()) + dbgs() << "Succ: " << getSucc()->getNumber() << "\n"; + else + dbgs() << "Succ: none \n"; + for (auto MRTI : Children) { + MRTI->dump(TRI, depth + 1); + } + } + + MRT *getEntryTree() { return Children.back(); } + + MRT *getExitTree() { return Children.front(); } + + MachineBasicBlock *getEntry() { + MRT *Tree = Children.back(); + return (Tree->isRegion()) ? Tree->getRegionMRT()->getEntry() + : Tree->getMBBMRT()->getMBB(); + } + + MachineBasicBlock *getExit() { + MRT *Tree = Children.front(); + return (Tree->isRegion()) ? 
Tree->getRegionMRT()->getExit() + : Tree->getMBBMRT()->getMBB(); + } + + void setSucc(MachineBasicBlock *MBB) { Succ = MBB; } + + MachineBasicBlock *getSucc() { return Succ; } + + bool contains(MachineBasicBlock *MBB) { + for (auto CI : Children) { + if (CI->isMBB()) { + if (MBB == CI->getMBBMRT()->getMBB()) { + return true; + } + } else { + if (CI->getRegionMRT()->contains(MBB)) { + return true; + } else if (CI->getRegionMRT()->getLinearizedRegion() != nullptr && + CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) { + return true; + } + } + } + return false; + } + + void replaceLiveOutReg(unsigned Register, unsigned NewRegister) { + LinearizedRegion *LRegion = getLinearizedRegion(); + LRegion->replaceLiveOut(Register, NewRegister); + for (auto &CI : Children) { + if (CI->isRegion()) { + CI->getRegionMRT()->replaceLiveOutReg(Register, NewRegister); + } + } + } + + RegionMRT(MachineRegion *MachineRegion) + : Region(MachineRegion), LRegion(nullptr), Succ(nullptr) { + setParent(nullptr); + setBBSelectRegOut(0); + setBBSelectRegIn(0); + } + + virtual ~RegionMRT() { + if (LRegion) { + delete LRegion; + } + + for (auto CI : Children) { + delete &(*CI); + } + } +}; + +static unsigned createBBSelectReg(const SIInstrInfo *TII, + MachineRegisterInfo *MRI) { + return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32)); +} + +MachineBasicBlock * +MRT::initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo, + DenseMap<MachineRegion *, RegionMRT *> &RegionMap) { + for (auto &MFI : MF) { + MachineBasicBlock *ExitMBB = &MFI; + if (ExitMBB->succ_size() == 0) { + return ExitMBB; + } + } + llvm_unreachable("CFG has no exit block"); + return nullptr; +} + +RegionMRT *MRT::buildMRT(MachineFunction &MF, + const MachineRegionInfo *RegionInfo, + const SIInstrInfo *TII, MachineRegisterInfo *MRI) { + SmallPtrSet<MachineRegion *, 4> PlacedRegions; + DenseMap<MachineRegion *, RegionMRT *> RegionMap; + MachineRegion *TopLevelRegion = RegionInfo->getTopLevelRegion(); + RegionMRT *Result = new RegionMRT(TopLevelRegion); + RegionMap[TopLevelRegion] = Result; + + // Insert the exit block first, we need it to be the merge node + // for the top level region. 
+ MachineBasicBlock *Exit = initializeMRT(MF, RegionInfo, RegionMap); + + unsigned BBSelectRegIn = createBBSelectReg(TII, MRI); + MBBMRT *ExitMRT = new MBBMRT(Exit); + RegionMap[RegionInfo->getRegionFor(Exit)]->addChild(ExitMRT); + ExitMRT->setBBSelectRegIn(BBSelectRegIn); + + for (auto MBBI : post_order(&(MF.front()))) { + MachineBasicBlock *MBB = &(*MBBI); + + // Skip Exit since we already added it + if (MBB == Exit) { + continue; + } + + DEBUG(dbgs() << "Visiting BB#" << MBB->getNumber() << "\n"); + MBBMRT *NewMBB = new MBBMRT(MBB); + MachineRegion *Region = RegionInfo->getRegionFor(MBB); + + // Ensure we have the MRT region + if (RegionMap.count(Region) == 0) { + RegionMRT *NewMRTRegion = new RegionMRT(Region); + RegionMap[Region] = NewMRTRegion; + + // Ensure all parents are in the RegionMap + MachineRegion *Parent = Region->getParent(); + while (RegionMap.count(Parent) == 0) { + RegionMRT *NewMRTParent = new RegionMRT(Parent); + NewMRTParent->addChild(NewMRTRegion); + NewMRTRegion->setParent(NewMRTParent); + RegionMap[Parent] = NewMRTParent; + NewMRTRegion = NewMRTParent; + Parent = Parent->getParent(); + } + RegionMap[Parent]->addChild(NewMRTRegion); + NewMRTRegion->setParent(RegionMap[Parent]); + } + + // Add MBB to Region MRT + RegionMap[Region]->addChild(NewMBB); + NewMBB->setParent(RegionMap[Region]); + RegionMap[Region]->setSucc(Region->getExit()); + } + return Result; +} + +void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, + MachineInstr *DefInstr, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + if (TRI->isVirtualRegister(Reg)) { + DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n"); + // If this is a source register to a PHI we are chaining, it + // must be live out. 
+ if (PHIInfo.isSource(Reg)) { + DEBUG(dbgs() << "Add LiveOut (PHI): " << PrintReg(Reg, TRI) << "\n"); + addLiveOut(Reg); + } else { + // If this is live out of the MBB + for (auto &UI : MRI->use_operands(Reg)) { + if (UI.getParent()->getParent() != MBB) { + DEBUG(dbgs() << "Add LiveOut (MBB BB#" << MBB->getNumber() + << "): " << PrintReg(Reg, TRI) << "\n"); + addLiveOut(Reg); + } else { + // If the use is in the same MBB we have to make sure + // it is after the def, otherwise it is live out in a loop + MachineInstr *UseInstr = UI.getParent(); + for (MachineBasicBlock::instr_iterator + MII = UseInstr->getIterator(), + MIE = UseInstr->getParent()->instr_end(); + MII != MIE; ++MII) { + if ((&(*MII)) == DefInstr) { + DEBUG(dbgs() << "Add LiveOut (Loop): " << PrintReg(Reg, TRI) + << "\n"); + addLiveOut(Reg); + } + } + } + } + } + } +} + +void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, + MachineInstr *DefInstr, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + if (TRI->isVirtualRegister(Reg)) { + DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n"); + for (auto &UI : MRI->use_operands(Reg)) { + if (!Region->contains(UI.getParent()->getParent())) { + DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region + << "): " << PrintReg(Reg, TRI) << "\n"); + addLiveOut(Reg); + } + } + } +} + +void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + DEBUG(dbgs() << "-Store Live Outs Begin (BB#" << MBB->getNumber() << ")-\n"); + for (auto &II : *MBB) { + for (auto &RI : II.defs()) { + storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo); + } + for (auto &IRI : II.implicit_operands()) { + if (IRI.isDef()) { + storeLiveOutReg(MBB, IRI.getReg(), IRI.getParent(), MRI, TRI, PHIInfo); + } + } + } + + // If we have a successor with a PHI, source coming from this MBB we have to + // add the register as live out + for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), + E = MBB->succ_end(); + SI != E; ++SI) { + for (auto &II : *(*SI)) { + if (II.isPHI()) { + MachineInstr &PHI = II; + int numPreds = getPHINumInputs(PHI); + for (int i = 0; i < numPreds; ++i) { + if (getPHIPred(PHI, i) == MBB) { + unsigned PHIReg = getPHISourceReg(PHI, i); + DEBUG(dbgs() << "Add LiveOut (PhiSource BB#" << MBB->getNumber() + << " -> BB#" << (*SI)->getNumber() + << "): " << PrintReg(PHIReg, TRI) << "\n"); + addLiveOut(PHIReg); + } + } + } + } + } + + DEBUG(dbgs() << "-Store Live Outs Endn-\n"); +} + +void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo, + RegionMRT *TopRegion) { + for (auto &II : *MBB) { + for (auto &RI : II.defs()) { + storeLiveOutRegRegion(TopRegion, RI.getReg(), RI.getParent(), MRI, TRI, + PHIInfo); + } + for (auto &IRI : II.implicit_operands()) { + if (IRI.isDef()) { + storeLiveOutRegRegion(TopRegion, IRI.getReg(), IRI.getParent(), MRI, + TRI, PHIInfo); + } + } + } +} + +void LinearizedRegion::storeLiveOuts(RegionMRT *Region, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo, + RegionMRT *CurrentTopRegion) { + MachineBasicBlock *Exit = Region->getSucc(); + + RegionMRT *TopRegion = + CurrentTopRegion == nullptr ? Region : CurrentTopRegion; + + // Check if exit is end of function, if so, no live outs. 
+ if (Exit == nullptr) + return; + + auto Children = Region->getChildren(); + for (auto CI : *Children) { + if (CI->isMBB()) { + auto MBB = CI->getMBBMRT()->getMBB(); + storeMBBLiveOuts(MBB, MRI, TRI, PHIInfo, TopRegion); + } else { + LinearizedRegion *SubRegion = CI->getRegionMRT()->getLinearizedRegion(); + // We should only store registers that are live out from the + // linearized region + for (auto MBBI : SubRegion->MBBs) { + storeMBBLiveOuts(MBBI, MRI, TRI, PHIInfo, TopRegion); + } + } + } + + if (CurrentTopRegion == nullptr) { + auto Succ = Region->getSucc(); + for (auto &II : *Succ) { + if (II.isPHI()) { + MachineInstr &PHI = II; + int numPreds = getPHINumInputs(PHI); + for (int i = 0; i < numPreds; ++i) { + if (Region->contains(getPHIPred(PHI, i))) { + unsigned PHIReg = getPHISourceReg(PHI, i); + DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region + << "): " << PrintReg(PHIReg, TRI) << "\n"); + addLiveOut(PHIReg); + } + } + } + } + } +} + +void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) { + OS << "Linearized Region {"; + bool IsFirst = true; + for (const auto &MBB : MBBs) { + if (IsFirst) { + IsFirst = false; + } else { + OS << " ,"; + } + OS << MBB->getNumber(); + } + OS << "} (" << Entry->getNumber() << ", " + << (Exit == nullptr ? -1 : Exit->getNumber()) + << "): In:" << PrintReg(getBBSelectRegIn(), TRI) + << " Out:" << PrintReg(getBBSelectRegOut(), TRI) << " {"; + for (auto &LI : LiveOuts) { + OS << PrintReg(LI, TRI) << " "; + } + OS << "} \n"; +} + +unsigned LinearizedRegion::getBBSelectRegIn() { + return getRegionMRT()->getBBSelectRegIn(); +} + +unsigned LinearizedRegion::getBBSelectRegOut() { + return getRegionMRT()->getBBSelectRegOut(); +} + +void LinearizedRegion::setHasLoop(bool Value) { HasLoop = Value; } + +bool LinearizedRegion::getHasLoop() { return HasLoop; } + +void LinearizedRegion::addLiveOut(unsigned VReg) { LiveOuts.insert(VReg); } + +void LinearizedRegion::removeLiveOut(unsigned Reg) { + if (isLiveOut(Reg)) + LiveOuts.erase(Reg); +} + +void LinearizedRegion::replaceLiveOut(unsigned OldReg, unsigned NewReg) { + if (isLiveOut(OldReg)) { + removeLiveOut(OldReg); + addLiveOut(NewReg); + } +} + +void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister, + MachineRegisterInfo *MRI, + bool ReplaceInside, bool ReplaceOutside, + bool IncludeLoopPHI) { + assert(Register != NewRegister && "Cannot replace a reg with itself"); + + DEBUG(dbgs() << "Preparing to replace register (region): " + << PrintReg(Register, MRI->getTargetRegisterInfo()) << " with " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); + + // If we are replacing outside, we also need to update the LiveOuts + if (ReplaceOutside && + (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) { + LinearizedRegion *Current = this; + while (Current != nullptr && Current->getEntry() != nullptr) { + DEBUG(dbgs() << "Region before register replace\n"); + DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); + Current->replaceLiveOut(Register, NewRegister); + DEBUG(dbgs() << "Region after register replace\n"); + DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); + Current = Current->getParent(); + } + } + + for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register), + E = MRI->reg_end(); + I != E;) { + MachineOperand &O = *I; + ++I; + + // We don't rewrite defs. 
+ if (O.isDef()) + continue; + + bool IsInside = contains(O.getParent()->getParent()); + bool IsLoopPHI = IsInside && (O.getParent()->isPHI() && + O.getParent()->getParent() == getEntry()); + bool ShouldReplace = (IsInside && ReplaceInside) || + (!IsInside && ReplaceOutside) || + (IncludeLoopPHI && IsLoopPHI); + if (ShouldReplace) { + + if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + DEBUG(dbgs() << "Trying to substitute physical register: " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + llvm_unreachable("Cannot substitute physical registers"); + } else { + DEBUG(dbgs() << "Replacing register (region): " + << PrintReg(Register, MRI->getTargetRegisterInfo()) + << " with " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + O.setReg(NewRegister); + } + } + } +} + +void LinearizedRegion::replaceRegisterInsideRegion(unsigned Register, + unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI) { + replaceRegister(Register, NewRegister, MRI, true, false, IncludeLoopPHIs); +} + +void LinearizedRegion::replaceRegisterOutsideRegion(unsigned Register, + unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI) { + replaceRegister(Register, NewRegister, MRI, false, true, IncludeLoopPHIs); +} + +DenseSet<unsigned> *LinearizedRegion::getLiveOuts() { return &LiveOuts; } + +void LinearizedRegion::setEntry(MachineBasicBlock *NewEntry) { + Entry = NewEntry; +} + +MachineBasicBlock *LinearizedRegion::getEntry() { return Entry; } + +void LinearizedRegion::setExit(MachineBasicBlock *NewExit) { Exit = NewExit; } + +MachineBasicBlock *LinearizedRegion::getExit() { return Exit; } + +void LinearizedRegion::addMBB(MachineBasicBlock *MBB) { MBBs.insert(MBB); } + +void LinearizedRegion::addMBBs(LinearizedRegion *InnerRegion) { + for (const auto &MBB : InnerRegion->MBBs) { + addMBB(MBB); + } +} + +bool LinearizedRegion::contains(MachineBasicBlock *MBB) { + return MBBs.count(MBB) == 1; +} + +bool LinearizedRegion::isLiveOut(unsigned Reg) { + return LiveOuts.count(Reg) == 1; +} + +bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) { + return MRI->def_begin(Reg) == MRI->def_end(); +} + +// After the code has been structurized, what was flagged as kills +// before are no longer register kills. 
+void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + for (auto MBBI : MBBs) { + MachineBasicBlock *MBB = MBBI; + for (auto &II : *MBB) { + for (auto &RI : II.uses()) { + if (RI.isReg()) { + unsigned Reg = RI.getReg(); + if (TRI->isVirtualRegister(Reg)) { + if (hasNoDef(Reg, MRI)) + continue; + if (!MRI->hasOneDef(Reg)) { + DEBUG(this->getEntry()->getParent()->dump()); + DEBUG(dbgs() << PrintReg(Reg, TRI) << "\n"); + } + + if (MRI->def_begin(Reg) == MRI->def_end()) { + DEBUG(dbgs() << "Register " + << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); + } else if (!MRI->hasOneDef(Reg)) { + DEBUG(dbgs() << "Register " + << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); + } + + assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); + MachineOperand *Def = &(*(MRI->def_begin(Reg))); + MachineOperand *UseOperand = &(RI); + bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB; + if (UseIsOutsideDefMBB && UseOperand->isKill()) { + DEBUG(dbgs() << "Removing kill flag on register: " + << PrintReg(Reg, TRI) << "\n"); + UseOperand->setIsKill(false); + } + } + } + } + } + } +} + +void LinearizedRegion::initLiveOut(RegionMRT *Region, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + storeLiveOuts(Region, MRI, TRI, PHIInfo); +} + +LinearizedRegion::LinearizedRegion(MachineBasicBlock *MBB, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + setEntry(MBB); + setExit(MBB); + storeLiveOuts(MBB, MRI, TRI, PHIInfo); + MBBs.insert(MBB); + Parent = nullptr; +} + +LinearizedRegion::LinearizedRegion() { + setEntry(nullptr); + setExit(nullptr); + Parent = nullptr; +} + +LinearizedRegion::~LinearizedRegion() {} + +class AMDGPUMachineCFGStructurizer : public MachineFunctionPass { +private: + const MachineRegionInfo *Regions; + const SIInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + unsigned BBSelectRegister; + PHILinearize PHIInfo; + DenseMap<MachineBasicBlock *, MachineBasicBlock *> FallthroughMap; + + void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &RegionIndices); + void getPHIRegionIndices(LinearizedRegion *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &RegionIndices); + void getPHINonRegionIndices(LinearizedRegion *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &PHINonRegionIndices); + + void storePHILinearizationInfoDest( + unsigned LDestReg, MachineInstr &PHI, + SmallVector<unsigned, 2> *RegionIndices = nullptr); + + unsigned storePHILinearizationInfo(MachineInstr &PHI, + SmallVector<unsigned, 2> *RegionIndices); + + void extractKilledPHIs(MachineBasicBlock *MBB); + + bool shrinkPHI(MachineInstr &PHI, SmallVector<unsigned, 2> &PHIIndices, + unsigned *ReplaceReg); + + bool shrinkPHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *SourceMBB, + SmallVector<unsigned, 2> &PHIIndices, unsigned *ReplaceReg); + + void replacePHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *LastMerge, + SmallVector<unsigned, 2> &PHIRegionIndices); + void replaceEntryPHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *IfMBB, + SmallVector<unsigned, 2> &PHIRegionIndices); + void replaceLiveOutRegs(MachineInstr &PHI, + SmallVector<unsigned, 2> &PHIRegionIndices, + unsigned CombinedSourceReg, + LinearizedRegion *LRegion); + void 
rewriteRegionExitPHI(RegionMRT *Region, MachineBasicBlock *LastMerge, + MachineInstr &PHI, LinearizedRegion *LRegion); + + void rewriteRegionExitPHIs(RegionMRT *Region, MachineBasicBlock *LastMerge, + LinearizedRegion *LRegion); + void rewriteRegionEntryPHI(LinearizedRegion *Region, MachineBasicBlock *IfMBB, + MachineInstr &PHI); + void rewriteRegionEntryPHIs(LinearizedRegion *Region, + MachineBasicBlock *IfMBB); + + bool regionIsSimpleIf(RegionMRT *Region); + + void transformSimpleIfRegion(RegionMRT *Region); + + void eliminateDeadBranchOperands(MachineBasicBlock::instr_iterator &II); + + void insertUnconditionalBranch(MachineBasicBlock *MBB, + MachineBasicBlock *Dest, + const DebugLoc &DL = DebugLoc()); + + MachineBasicBlock *createLinearizedExitBlock(RegionMRT *Region); + + void insertMergePHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, unsigned DestRegister, + unsigned IfSourceRegister, unsigned CodeSourceRegister, + bool IsUndefIfSource = false); + + MachineBasicBlock *createIfBlock(MachineBasicBlock *MergeBB, + MachineBasicBlock *CodeBBStart, + MachineBasicBlock *CodeBBEnd, + MachineBasicBlock *SelectBB, unsigned IfReg, + bool InheritPreds); + + void prunePHIInfo(MachineBasicBlock *MBB); + void createEntryPHI(LinearizedRegion *CurrentRegion, unsigned DestReg); + + void createEntryPHIs(LinearizedRegion *CurrentRegion); + void resolvePHIInfos(MachineBasicBlock *FunctionEntry); + + void replaceRegisterWith(unsigned Register, unsigned NewRegister); + + MachineBasicBlock *createIfRegion(MachineBasicBlock *MergeBB, + MachineBasicBlock *CodeBB, + LinearizedRegion *LRegion, + unsigned BBSelectRegIn, + unsigned BBSelectRegOut); + + MachineBasicBlock * + createIfRegion(MachineBasicBlock *MergeMBB, LinearizedRegion *InnerRegion, + LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB, + unsigned BBSelectRegIn, unsigned BBSelectRegOut); + void ensureCondIsNotKilled(SmallVector<MachineOperand, 1> Cond); + + void rewriteCodeBBTerminator(MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + unsigned BBSelectReg); + + MachineInstr *getDefInstr(unsigned Reg); + void insertChainedPHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, unsigned DestReg, + unsigned SourceReg); + bool containsDef(MachineBasicBlock *MBB, LinearizedRegion *InnerRegion, + unsigned Register); + void rewriteLiveOutRegs(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, + LinearizedRegion *LRegion); + + void splitLoopPHI(MachineInstr &PHI, MachineBasicBlock *Entry, + MachineBasicBlock *EntrySucc, LinearizedRegion *LRegion); + void splitLoopPHIs(MachineBasicBlock *Entry, MachineBasicBlock *EntrySucc, + LinearizedRegion *LRegion); + + MachineBasicBlock *splitExit(LinearizedRegion *LRegion); + + MachineBasicBlock *splitEntry(LinearizedRegion *LRegion); + + LinearizedRegion *initLinearizedRegion(RegionMRT *Region); + + bool structurizeComplexRegion(RegionMRT *Region); + + bool structurizeRegion(RegionMRT *Region); + + bool structurizeRegions(RegionMRT *Region, bool isTopRegion); + +public: + static char ID; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineRegionInfoPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) { + initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry()); + } + + void initFallthroughMap(MachineFunction &MF); + + 
void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut); + + unsigned initializeSelectRegisters(MRT *MRT, unsigned ExistingExitReg, + MachineRegisterInfo *MRI, + const SIInstrInfo *TII); + + RegionMRT *RMRT; + void setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; } + + RegionMRT *getRegionMRT() { return RMRT; } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} + +char AMDGPUMachineCFGStructurizer::ID = 0; + +bool AMDGPUMachineCFGStructurizer::regionIsSimpleIf(RegionMRT *Region) { + MachineBasicBlock *Entry = Region->getEntry(); + MachineBasicBlock *Succ = Region->getSucc(); + bool FoundBypass = false; + bool FoundIf = false; + + if (Entry->succ_size() != 2) { + return false; + } + + for (MachineBasicBlock::const_succ_iterator SI = Entry->succ_begin(), + E = Entry->succ_end(); + SI != E; ++SI) { + MachineBasicBlock *Current = *SI; + + if (Current == Succ) { + FoundBypass = true; + } else if ((Current->succ_size() == 1) && + *(Current->succ_begin()) == Succ) { + FoundIf = true; + } + } + + return FoundIf && FoundBypass; +} + +void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) { + MachineBasicBlock *Entry = Region->getEntry(); + MachineBasicBlock *Exit = Region->getExit(); + TII->convertNonUniformIfRegion(Entry, Exit); +} + +static void fixMBBTerminator(MachineBasicBlock *MBB) { + + if (MBB->succ_size() == 1) { + auto *Succ = *(MBB->succ_begin()); + for (auto &TI : MBB->terminators()) { + for (auto &UI : TI.uses()) { + if (UI.isMBB() && UI.getMBB() != Succ) { + UI.setMBB(Succ); + } + } + } + } +} + +static void fixRegionTerminator(RegionMRT *Region) { + MachineBasicBlock *InternalSucc = nullptr; + MachineBasicBlock *ExternalSucc = nullptr; + LinearizedRegion *LRegion = Region->getLinearizedRegion(); + auto Exit = LRegion->getExit(); + + SmallPtrSet<MachineBasicBlock *, 2> Successors; + for (MachineBasicBlock::const_succ_iterator SI = Exit->succ_begin(), + SE = Exit->succ_end(); + SI != SE; ++SI) { + MachineBasicBlock *Succ = *SI; + if (LRegion->contains(Succ)) { + // Do not allow re-assign + assert(InternalSucc == nullptr); + InternalSucc = Succ; + } else { + // Do not allow re-assign + assert(ExternalSucc == nullptr); + ExternalSucc = Succ; + } + } + + for (auto &TI : Exit->terminators()) { + for (auto &UI : TI.uses()) { + if (UI.isMBB()) { + auto Target = UI.getMBB(); + if (Target != InternalSucc && Target != ExternalSucc) { + UI.setMBB(ExternalSucc); + } + } + } + } +} + +// If a region region is just a sequence of regions (and the exit +// block in the case of the top level region), we can simply skip +// linearizing it, because it is already linear +bool regionIsSequence(RegionMRT *Region) { + auto Children = Region->getChildren(); + for (auto CI : *Children) { + if (!CI->isRegion()) { + if (CI->getMBBMRT()->getMBB()->succ_size() > 1) { + return false; + } + } + } + return true; +} + +void fixupRegionExits(RegionMRT *Region) { + auto Children = Region->getChildren(); + for (auto CI : *Children) { + if (!CI->isRegion()) { + fixMBBTerminator(CI->getMBBMRT()->getMBB()); + } else { + fixRegionTerminator(CI->getRegionMRT()); + } + } +} + +void AMDGPUMachineCFGStructurizer::getPHIRegionIndices( + RegionMRT *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &PHIRegionIndices) { + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + MachineBasicBlock *Pred = getPHIPred(PHI, i); + if (Region->contains(Pred)) { + PHIRegionIndices.push_back(i); + } + } +} + +void 
AMDGPUMachineCFGStructurizer::getPHIRegionIndices( + LinearizedRegion *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &PHIRegionIndices) { + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + MachineBasicBlock *Pred = getPHIPred(PHI, i); + if (Region->contains(Pred)) { + PHIRegionIndices.push_back(i); + } + } +} + +void AMDGPUMachineCFGStructurizer::getPHINonRegionIndices( + LinearizedRegion *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &PHINonRegionIndices) { + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + MachineBasicBlock *Pred = getPHIPred(PHI, i); + if (!Region->contains(Pred)) { + PHINonRegionIndices.push_back(i); + } + } +} + +void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest( + unsigned LDestReg, MachineInstr &PHI, + SmallVector<unsigned, 2> *RegionIndices) { + if (RegionIndices) { + for (auto i : *RegionIndices) { + PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i)); + } + } else { + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i)); + } + } +} + +unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo( + MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) { + unsigned DestReg = getPHIDestReg(PHI); + unsigned LinearizeDestReg = + MRI->createVirtualRegister(MRI->getRegClass(DestReg)); + PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc()); + storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices); + return LinearizeDestReg; +} + +void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) { + // We need to create a new chain for the killed phi, but there is no + // need to do the renaming outside or inside the block. 
+ SmallPtrSet<MachineInstr *, 2> PHIs; + for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(), + E = MBB->instr_end(); + I != E; ++I) { + MachineInstr &Instr = *I; + if (Instr.isPHI()) { + unsigned PHIDestReg = getPHIDestReg(Instr); + DEBUG(dbgs() << "Extracting killed phi:\n"); + DEBUG(Instr.dump()); + PHIs.insert(&Instr); + PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc()); + storePHILinearizationInfoDest(PHIDestReg, Instr); + } + } + + for (auto PI : PHIs) { + PI->eraseFromParent(); + } +} + +static bool isPHIRegionIndex(SmallVector<unsigned, 2> PHIRegionIndices, + unsigned Index) { + for (auto i : PHIRegionIndices) { + if (i == Index) + return true; + } + return false; +} + +bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, + SmallVector<unsigned, 2> &PHIIndices, + unsigned *ReplaceReg) { + return shrinkPHI(PHI, 0, nullptr, PHIIndices, ReplaceReg); +} + +bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, + unsigned CombinedSourceReg, + MachineBasicBlock *SourceMBB, + SmallVector<unsigned, 2> &PHIIndices, + unsigned *ReplaceReg) { + DEBUG(dbgs() << "Shrink PHI: "); + DEBUG(PHI.dump()); + DEBUG(dbgs() << " to " << PrintReg(getPHIDestReg(PHI), TRI) + << "<def> = PHI("); + + bool Replaced = false; + unsigned NumInputs = getPHINumInputs(PHI); + int SingleExternalEntryIndex = -1; + for (unsigned i = 0; i < NumInputs; ++i) { + if (!isPHIRegionIndex(PHIIndices, i)) { + if (SingleExternalEntryIndex == -1) { + // Single entry + SingleExternalEntryIndex = i; + } else { + // Multiple entries + SingleExternalEntryIndex = -2; + } + } + } + + if (SingleExternalEntryIndex > -1) { + *ReplaceReg = getPHISourceReg(PHI, SingleExternalEntryIndex); + // We should not rewrite the code, we should only pick up the single value + // that represents the shrunk PHI. 
+ Replaced = true; + } else { + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + if (SourceMBB) { + MIB.addReg(CombinedSourceReg); + MIB.addMBB(SourceMBB); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << SourceMBB->getNumber()); + } + + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + } + PHI.eraseFromParent(); + return Replaced; +} + +void AMDGPUMachineCFGStructurizer::replacePHI( + MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge, + SmallVector<unsigned, 2> &PHIRegionIndices) { + DEBUG(dbgs() << "Replace PHI: "); + DEBUG(PHI.dump()); + DEBUG(dbgs() << " with " << PrintReg(getPHIDestReg(PHI), TRI) + << "<def> = PHI("); + + bool HasExternalEdge = false; + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + if (!isPHIRegionIndex(PHIRegionIndices, i)) { + HasExternalEdge = true; + } + } + + if (HasExternalEdge) { + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + MIB.addReg(CombinedSourceReg); + MIB.addMBB(LastMerge); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << LastMerge->getNumber()); + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + } else { + replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg); + } + PHI.eraseFromParent(); +} + +void AMDGPUMachineCFGStructurizer::replaceEntryPHI( + MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB, + SmallVector<unsigned, 2> &PHIRegionIndices) { + + DEBUG(dbgs() << "Replace entry PHI: "); + DEBUG(PHI.dump()); + DEBUG(dbgs() << " with "); + + unsigned NumInputs = getPHINumInputs(PHI); + unsigned NumNonRegionInputs = NumInputs; + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + NumNonRegionInputs--; + } + } + + if (NumNonRegionInputs == 0) { + auto DestReg = getPHIDestReg(PHI); + replaceRegisterWith(DestReg, CombinedSourceReg); + DEBUG(dbgs() << " register " << PrintReg(CombinedSourceReg, TRI) << "\n"); + PHI.eraseFromParent(); + } else { + DEBUG(dbgs() << PrintReg(getPHIDestReg(PHI), TRI) << "<def> = PHI("); + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + MIB.addReg(CombinedSourceReg); + MIB.addMBB(IfMBB); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << IfMBB->getNumber()); + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + 
DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + PHI.eraseFromParent(); + } +} + +void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs( + MachineInstr &PHI, SmallVector<unsigned, 2> &PHIRegionIndices, + unsigned CombinedSourceReg, LinearizedRegion *LRegion) { + bool WasLiveOut = false; + for (auto PII : PHIRegionIndices) { + unsigned Reg = getPHISourceReg(PHI, PII); + if (LRegion->isLiveOut(Reg)) { + bool IsDead = true; + + // Check if register is live out of the basic block + MachineBasicBlock *DefMBB = getDefInstr(Reg)->getParent(); + for (auto UI = MRI->use_begin(Reg), E = MRI->use_end(); UI != E; ++UI) { + if ((*UI).getParent()->getParent() != DefMBB) { + IsDead = false; + } + } + + DEBUG(dbgs() << "Register " << PrintReg(Reg, TRI) << " is " + << (IsDead ? "dead" : "alive") << " after PHI replace\n"); + if (IsDead) { + LRegion->removeLiveOut(Reg); + } + WasLiveOut = true; + } + } + + if (WasLiveOut) + LRegion->addLiveOut(CombinedSourceReg); +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHI(RegionMRT *Region, + MachineBasicBlock *LastMerge, + MachineInstr &PHI, + LinearizedRegion *LRegion) { + SmallVector<unsigned, 2> PHIRegionIndices; + getPHIRegionIndices(Region, PHI, PHIRegionIndices); + unsigned LinearizedSourceReg = + storePHILinearizationInfo(PHI, &PHIRegionIndices); + + replacePHI(PHI, LinearizedSourceReg, LastMerge, PHIRegionIndices); + replaceLiveOutRegs(PHI, PHIRegionIndices, LinearizedSourceReg, LRegion); +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHI(LinearizedRegion *Region, + MachineBasicBlock *IfMBB, + MachineInstr &PHI) { + SmallVector<unsigned, 2> PHINonRegionIndices; + getPHINonRegionIndices(Region, PHI, PHINonRegionIndices); + unsigned LinearizedSourceReg = + storePHILinearizationInfo(PHI, &PHINonRegionIndices); + replaceEntryPHI(PHI, LinearizedSourceReg, IfMBB, PHINonRegionIndices); +} + +static void collectPHIs(MachineBasicBlock *MBB, + SmallVector<MachineInstr *, 2> &PHIs) { + for (auto &BBI : *MBB) { + if (BBI.isPHI()) { + PHIs.push_back(&BBI); + } + } +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHIs(RegionMRT *Region, + MachineBasicBlock *LastMerge, + LinearizedRegion *LRegion) { + SmallVector<MachineInstr *, 2> PHIs; + auto Exit = Region->getSucc(); + if (Exit == nullptr) + return; + + collectPHIs(Exit, PHIs); + + for (auto PHII : PHIs) { + rewriteRegionExitPHI(Region, LastMerge, *PHII, LRegion); + } +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Region, + MachineBasicBlock *IfMBB) { + SmallVector<MachineInstr *, 2> PHIs; + auto Entry = Region->getEntry(); + + collectPHIs(Entry, PHIs); + + for (auto PHII : PHIs) { + rewriteRegionEntryPHI(Region, IfMBB, *PHII); + } +} + +void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB, + MachineBasicBlock *Dest, + const DebugLoc &DL) { + DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber() + << " -> " << Dest->getNumber() << "\n"); + MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator(); + bool HasTerminator = Terminator != MBB->instr_end(); + if (HasTerminator) { + TII->ReplaceTailWithBranchTo(Terminator, Dest); + } + if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(Dest)) { + TII->insertUnconditionalBranch(*MBB, Dest, DL); + } +} + +static MachineBasicBlock *getSingleExitNode(MachineFunction &MF) { + MachineBasicBlock *result = nullptr; + for (auto &MFI : MF) { + if (MFI.succ_size() == 0) { + 
if (result == nullptr) { + result = &MFI; + } else { + return nullptr; + } + } + } + + return result; +} + +static bool hasOneExitNode(MachineFunction &MF) { + return getSingleExitNode(MF) != nullptr; +} + +MachineBasicBlock * +AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) { + auto Exit = Region->getSucc(); + + // If the exit is the end of the function, we just use the existing exit block. + MachineFunction *MF = Region->getEntry()->getParent(); + if (Exit == nullptr && hasOneExitNode(*MF)) { + return &(*(--(Region->getEntry()->getParent()->end()))); + } + + MachineBasicBlock *LastMerge = MF->CreateMachineBasicBlock(); + if (Exit == nullptr) { + MachineFunction::iterator ExitIter = MF->end(); + MF->insert(ExitIter, LastMerge); + } else { + MachineFunction::iterator ExitIter = Exit->getIterator(); + MF->insert(ExitIter, LastMerge); + LastMerge->addSuccessor(Exit); + insertUnconditionalBranch(LastMerge, Exit); + DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n"); + } + return LastMerge; +} + +void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB, + MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + unsigned DestRegister, + unsigned IfSourceRegister, + unsigned CodeSourceRegister, + bool IsUndefIfSource) { + // If this is the function exit block, we don't need a phi. + if (MergeBB->succ_begin() == MergeBB->succ_end()) { + return; + } + DEBUG(dbgs() << "Merge PHI (BB#" << MergeBB->getNumber() + << "): " << PrintReg(DestRegister, TRI) << "<def> = PHI(" + << PrintReg(IfSourceRegister, TRI) << ", BB#" + << IfBB->getNumber() << PrintReg(CodeSourceRegister, TRI) + << ", BB#" << CodeBB->getNumber() << ")\n"); + const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin()); + MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL, + TII->get(TargetOpcode::PHI), DestRegister); + if (IsUndefIfSource && false) { + MIB.addReg(IfSourceRegister, RegState::Undef); + } else { + MIB.addReg(IfSourceRegister); + } + MIB.addMBB(IfBB); + MIB.addReg(CodeSourceRegister); + MIB.addMBB(CodeBB); +} + +static void removeExternalCFGSuccessors(MachineBasicBlock *MBB) { + for (MachineBasicBlock::succ_iterator PI = MBB->succ_begin(), + E = MBB->succ_end(); + PI != E; ++PI) { + if ((*PI) != MBB) { + (MBB)->removeSuccessor(*PI); + } + } +} + +static void removeExternalCFGEdges(MachineBasicBlock *StartMBB, + MachineBasicBlock *EndMBB) { + + // We have to check against the StartMBB successor because a + // structurized region with a loop will have the entry block split, + // and the backedge will go to the entry successor. + DenseSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Succs; + unsigned SuccSize = StartMBB->succ_size(); + if (SuccSize > 0) { + MachineBasicBlock *StartMBBSucc = *(StartMBB->succ_begin()); + for (MachineBasicBlock::succ_iterator PI = EndMBB->succ_begin(), + E = EndMBB->succ_end(); + PI != E; ++PI) { + // Either we have a back-edge to the entry block, or a back-edge to the + // successor of the entry block since the block may be split. 
+ if ((*PI) != StartMBB && + !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) { + Succs.insert( + std::pair<MachineBasicBlock *, MachineBasicBlock *>(EndMBB, *PI)); + } + } + } + + for (MachineBasicBlock::pred_iterator PI = StartMBB->pred_begin(), + E = StartMBB->pred_end(); + PI != E; ++PI) { + if ((*PI) != EndMBB) { + Succs.insert( + std::pair<MachineBasicBlock *, MachineBasicBlock *>(*PI, StartMBB)); + } + } + + for (auto SI : Succs) { + std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI; + DEBUG(dbgs() << "Removing edge: BB#" << Edge.first->getNumber() << " -> BB#" + << Edge.second->getNumber() << "\n"); + Edge.first->removeSuccessor(Edge.second); + } +} + +MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock( + MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBBStart, + MachineBasicBlock *CodeBBEnd, MachineBasicBlock *SelectBB, unsigned IfReg, + bool InheritPreds) { + MachineFunction *MF = MergeBB->getParent(); + MachineBasicBlock *IfBB = MF->CreateMachineBasicBlock(); + + if (InheritPreds) { + for (MachineBasicBlock::pred_iterator PI = CodeBBStart->pred_begin(), + E = CodeBBStart->pred_end(); + PI != E; ++PI) { + if ((*PI) != CodeBBEnd) { + MachineBasicBlock *Pred = (*PI); + Pred->addSuccessor(IfBB); + } + } + } + + removeExternalCFGEdges(CodeBBStart, CodeBBEnd); + + auto CodeBBStartI = CodeBBStart->getIterator(); + auto CodeBBEndI = CodeBBEnd->getIterator(); + auto MergeIter = MergeBB->getIterator(); + MF->insert(MergeIter, IfBB); + MF->splice(MergeIter, CodeBBStartI, ++CodeBBEndI); + IfBB->addSuccessor(MergeBB); + IfBB->addSuccessor(CodeBBStart); + + DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n"); + // Ensure that the MergeBB is a successor of the CodeEndBB. + if (!CodeBBEnd->isSuccessor(MergeBB)) + CodeBBEnd->addSuccessor(MergeBB); + + DEBUG(dbgs() << "Moved MBB#" << CodeBBStart->getNumber() << " through MBB#" + << CodeBBEnd->getNumber() << "\n"); + + // If we have a single predecessor we can find a reasonable debug location + MachineBasicBlock *SinglePred = + CodeBBStart->pred_size() == 1 ? *(CodeBBStart->pred_begin()) : nullptr; + const DebugLoc &DL = SinglePred + ? 
SinglePred->findDebugLoc(SinglePred->getFirstTerminator()) + : DebugLoc(); + + unsigned Reg = + TII->insertEQ(IfBB, IfBB->begin(), DL, IfReg, + SelectBB->getNumber() /* CodeBBStart->getNumber() */); + if (&(*(IfBB->getParent()->begin())) == IfBB) { + TII->materializeImmediate(*IfBB, IfBB->begin(), DL, IfReg, + CodeBBStart->getNumber()); + } + MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true); + ArrayRef<MachineOperand> Cond(RegOp); + TII->insertBranch(*IfBB, MergeBB, CodeBBStart, Cond, DL); + + return IfBB; +} + +void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled( + SmallVector<MachineOperand, 1> Cond) { + if (Cond.size() != 1) + return; + if (!Cond[0].isReg()) + return; + + unsigned CondReg = Cond[0].getReg(); + for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) { + (*UI).setIsKill(false); + } +} + +void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + unsigned BBSelectReg) { + MachineBasicBlock *TrueBB = nullptr; + MachineBasicBlock *FalseBB = nullptr; + SmallVector<MachineOperand, 1> Cond; + MachineBasicBlock *FallthroughBB = FallthroughMap[CodeBB]; + TII->analyzeBranch(*CodeBB, TrueBB, FalseBB, Cond); + + const DebugLoc &DL = CodeBB->findDebugLoc(CodeBB->getFirstTerminator()); + + if (FalseBB == nullptr && TrueBB == nullptr && FallthroughBB == nullptr) { + // This is an exit block, hence no successors. We will assign the + // bb select register to the entry block. + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, + CodeBB->getParent()->begin()->getNumber()); + insertUnconditionalBranch(CodeBB, MergeBB, DL); + return; + } + + if (FalseBB == nullptr && TrueBB == nullptr) { + TrueBB = FallthroughBB; + } else if (TrueBB != nullptr) { + FalseBB = + (FallthroughBB && (FallthroughBB != TrueBB)) ? 
FallthroughBB : FalseBB; + } + + if ((TrueBB != nullptr && FalseBB == nullptr) || (TrueBB == FalseBB)) { + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, TrueBB->getNumber()); + } else { + const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg); + unsigned TrueBBReg = MRI->createVirtualRegister(RegClass); + unsigned FalseBBReg = MRI->createVirtualRegister(RegClass); + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + TrueBBReg, TrueBB->getNumber()); + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + FalseBBReg, FalseBB->getNumber()); + ensureCondIsNotKilled(Cond); + TII->insertVectorSelect(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, Cond, TrueBBReg, FalseBBReg); + } + + insertUnconditionalBranch(CodeBB, MergeBB, DL); +} + +MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) { + if (MRI->def_begin(Reg) == MRI->def_end()) { + DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); + } else if (!MRI->hasOneDef(Reg)) { + DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); + DEBUG(dbgs() << "DEFS BEGIN:\n"); + for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) { + DEBUG(DI->getParent()->dump()); + } + DEBUG(dbgs() << "DEFS END\n"); + } + + assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); + return (*(MRI->def_begin(Reg))).getParent(); +} + +void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB, + MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, + unsigned DestReg, + unsigned SourceReg) { + // In this function we know we are part of a chain already, so we need + // to add the registers to the existing chain, and rename the register + // inside the region. + bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit(); + MachineInstr *DefInstr = getDefInstr(SourceReg); + if (DefInstr->isPHI() && DefInstr->getParent() == CodeBB && IsSingleBB) { + // Handle the case where the def is a PHI-def inside a basic + // block, then we only need to do renaming. Special care needs to + // be taken if the PHI-def is part of an existing chain, or if a + // new one needs to be created. + InnerRegion->replaceRegisterInsideRegion(SourceReg, DestReg, true, MRI); + + // We collect all PHI Information, and if we are at the region entry, + // all PHIs will be removed, and then re-introduced if needed. + storePHILinearizationInfoDest(DestReg, *DefInstr); + // We have picked up all the information we need now and can remove + // the PHI + PHIInfo.removeSource(DestReg, SourceReg, CodeBB); + DefInstr->eraseFromParent(); + } else { + // If this is not a phi-def, or it is a phi-def but from a linearized region + if (IsSingleBB && DefInstr->getParent() == InnerRegion->getEntry()) { + // If this is a single BB and the definition is in this block we + // need to replace any uses outside the region. 
+ InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI); + } + const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg); + unsigned NextDestReg = MRI->createVirtualRegister(RegClass); + bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1; + DEBUG(dbgs() << "Insert Chained PHI\n"); + insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg, + SourceReg, IsLastDef); + + PHIInfo.removeSource(DestReg, SourceReg, CodeBB); + if (IsLastDef) { + const DebugLoc &DL = IfBB->findDebugLoc(IfBB->getFirstTerminator()); + TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DL, + NextDestReg, 0); + PHIInfo.deleteDef(DestReg); + } else { + PHIInfo.replaceDef(DestReg, NextDestReg); + } + } +} + +bool AMDGPUMachineCFGStructurizer::containsDef(MachineBasicBlock *MBB, + LinearizedRegion *InnerRegion, + unsigned Register) { + return getDefInstr(Register)->getParent() == MBB || + InnerRegion->contains(getDefInstr(Register)->getParent()); +} + +void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, + MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, + LinearizedRegion *LRegion) { + DenseSet<unsigned> *LiveOuts = InnerRegion->getLiveOuts(); + SmallVector<unsigned, 4> OldLiveOuts; + bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit(); + for (auto OLI : *LiveOuts) { + OldLiveOuts.push_back(OLI); + } + + for (auto LI : OldLiveOuts) { + DEBUG(dbgs() << "LiveOut: " << PrintReg(LI, TRI)); + if (!containsDef(CodeBB, InnerRegion, LI) || + (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) { + // If the register simply lives through the CodeBB, we don't have + // to rewrite anything since the register is not defined in this + // part of the code. + DEBUG(dbgs() << "- through"); + continue; + } + DEBUG(dbgs() << "\n"); + unsigned Reg = LI; + if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) { + // If the register is live out, we do want to create a phi, + // unless it is from the Exit block, because in that case there + // is already a PHI, and no need to create a new one. + + // If the register is just a live out def and not part of a phi + // chain, we need to create a PHI node to handle the if region, + // and replace all uses outside of the region with the new dest + // register, unless it is the outgoing BB select register. We have + // already created phi nodes for these. + const TargetRegisterClass *RegClass = MRI->getRegClass(Reg); + unsigned PHIDestReg = MRI->createVirtualRegister(RegClass); + unsigned IfSourceReg = MRI->createVirtualRegister(RegClass); + // Create initializer, this value is never used, but is needed + // to satisfy SSA. + DEBUG(dbgs() << "Initializer for reg: " << PrintReg(Reg) << "\n"); + TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(), + IfSourceReg, 0); + + InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI); + DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n"); + insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg, + IfSourceReg, Reg, true); + } + } + + // Handle the chained definitions in PHIInfo, checking if this basic block + // is a source block for a definition. 
+ SmallVector<unsigned, 4> Sources; + if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) { + DEBUG(dbgs() << "Inserting PHI Live Out from BB#" << CodeBB->getNumber() + << "\n"); + for (auto SI : Sources) { + unsigned DestReg; + PHIInfo.findDest(SI, CodeBB, DestReg); + insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI); + } + DEBUG(dbgs() << "Insertion done.\n"); + } + + DEBUG(PHIInfo.dump(MRI)); +} + +void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) { + DEBUG(dbgs() << "Before PHI Prune\n"); + DEBUG(PHIInfo.dump(MRI)); + SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4> + ElimiatedSources; + for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; + ++DRI) { + + unsigned DestReg = *DRI; + auto SE = PHIInfo.sources_end(DestReg); + + bool MBBContainsPHISource = false; + // Check if there is a PHI source in this MBB + for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { + unsigned SourceReg = (*SRI).first; + MachineOperand *Def = &(*(MRI->def_begin(SourceReg))); + if (Def->getParent()->getParent() == MBB) { + MBBContainsPHISource = true; + } + } + + // If so, all other sources are useless since we know this block + // is always executed when the region is executed. + if (MBBContainsPHISource) { + for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { + PHILinearize::PHISourceT Source = *SRI; + unsigned SourceReg = Source.first; + MachineBasicBlock *SourceMBB = Source.second; + MachineOperand *Def = &(*(MRI->def_begin(SourceReg))); + if (Def->getParent()->getParent() != MBB) { + ElimiatedSources.push_back( + std::make_tuple(DestReg, SourceReg, SourceMBB)); + } + } + } + } + + // Remove the PHI sources that are in the given MBB + for (auto &SourceInfo : ElimiatedSources) { + PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo), + std::get<2>(SourceInfo)); + } + DEBUG(dbgs() << "After PHI Prune\n"); + DEBUG(PHIInfo.dump(MRI)); +} + +void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion, + unsigned DestReg) { + MachineBasicBlock *Entry = CurrentRegion->getEntry(); + MachineBasicBlock *Exit = CurrentRegion->getExit(); + + DEBUG(dbgs() << "RegionExit: " << Exit->getNumber() + << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n"); + + int NumSources = 0; + auto SE = PHIInfo.sources_end(DestReg); + + for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { + NumSources++; + } + + if (NumSources == 1) { + auto SRI = PHIInfo.sources_begin(DestReg); + unsigned SourceReg = (*SRI).first; + replaceRegisterWith(DestReg, SourceReg); + } else { + const DebugLoc &DL = Entry->findDebugLoc(Entry->begin()); + MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL, + TII->get(TargetOpcode::PHI), DestReg); + DEBUG(dbgs() << "Entry PHI " << PrintReg(DestReg, TRI) << "<def> = PHI("); + + unsigned CurrentBackedgeReg = 0; + + for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { + unsigned SourceReg = (*SRI).first; + + if (CurrentRegion->contains((*SRI).second)) { + if (CurrentBackedgeReg == 0) { + CurrentBackedgeReg = SourceReg; + } else { + MachineInstr *PHIDefInstr = getDefInstr(SourceReg); + MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent(); + const TargetRegisterClass *RegClass = + MRI->getRegClass(CurrentBackedgeReg); + unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass); + MachineInstrBuilder BackedgePHI = + BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL, + TII->get(TargetOpcode::PHI), NewBackedgeReg); + 
BackedgePHI.addReg(CurrentBackedgeReg); + BackedgePHI.addMBB(getPHIPred(*PHIDefInstr, 0)); + BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1)); + BackedgePHI.addMBB((*SRI).second); + CurrentBackedgeReg = NewBackedgeReg; + DEBUG(dbgs() << "Inserting backedge PHI: " + << PrintReg(NewBackedgeReg, TRI) << "<def> = PHI(" + << PrintReg(CurrentBackedgeReg, TRI) << ", BB#" + << getPHIPred(*PHIDefInstr, 0)->getNumber() << ", " + << PrintReg(getPHISourceReg(*PHIDefInstr, 1), TRI) + << ", BB#" << (*SRI).second->getNumber()); + } + } else { + MIB.addReg(SourceReg); + MIB.addMBB((*SRI).second); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << (*SRI).second->getNumber() << ", "); + } + } + + // Add the final backedge register source to the entry phi + if (CurrentBackedgeReg != 0) { + MIB.addReg(CurrentBackedgeReg); + MIB.addMBB(Exit); + DEBUG(dbgs() << PrintReg(CurrentBackedgeReg, TRI) << ", BB#" + << Exit->getNumber() << ")\n"); + } else { + DEBUG(dbgs() << ")\n"); + } + } +} + +void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) { + DEBUG(PHIInfo.dump(MRI)); + + for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; + ++DRI) { + + unsigned DestReg = *DRI; + createEntryPHI(CurrentRegion, DestReg); + } + PHIInfo.clear(); +} + +void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, + unsigned NewRegister) { + assert(Register != NewRegister && "Cannot replace a reg with itself"); + + for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register), + E = MRI->reg_end(); + I != E;) { + MachineOperand &O = *I; + ++I; + if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + DEBUG(dbgs() << "Trying to substitute physical register: " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + llvm_unreachable("Cannot substitute physical registers"); + // We don't handle physical registers, but if we need to + // in the future This is how we do it: + // O.substPhysReg(NewRegister, *TRI); + } else { + DEBUG(dbgs() << "Replacing register: " + << PrintReg(Register, MRI->getTargetRegisterInfo()) + << " with " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + O.setReg(NewRegister); + } + } + PHIInfo.deleteDef(Register); + + getRegionMRT()->replaceLiveOutReg(Register, NewRegister); + + DEBUG(PHIInfo.dump(MRI)); +} + +void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) { + DEBUG(dbgs() << "Resolve PHI Infos\n"); + DEBUG(PHIInfo.dump(MRI)); + for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; + ++DRI) { + unsigned DestReg = *DRI; + DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) << "\n"); + auto SRI = PHIInfo.sources_begin(DestReg); + unsigned SourceReg = (*SRI).first; + DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) + << " SourceReg: " << PrintReg(SourceReg, TRI) << "\n"); + + assert(PHIInfo.sources_end(DestReg) == ++SRI && + "More than one phi source in entry node"); + replaceRegisterWith(DestReg, SourceReg); + } +} + +static bool isFunctionEntryBlock(MachineBasicBlock *MBB) { + return ((&(*(MBB->getParent()->begin()))) == MBB); +} + +MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( + MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBB, + LinearizedRegion *CurrentRegion, unsigned BBSelectRegIn, + unsigned BBSelectRegOut) { + if (isFunctionEntryBlock(CodeBB) && !CurrentRegion->getHasLoop()) { + // Handle non-loop function entry block. 
+ // We need to allow loops to the entry block and then + rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut); + resolvePHIInfos(CodeBB); + removeExternalCFGSuccessors(CodeBB); + CodeBB->addSuccessor(MergeBB); + CurrentRegion->addMBB(CodeBB); + return nullptr; + } + if (CurrentRegion->getEntry() == CodeBB && !CurrentRegion->getHasLoop()) { + // Handle non-loop region entry block. + MachineFunction *MF = MergeBB->getParent(); + auto MergeIter = MergeBB->getIterator(); + auto CodeBBStartIter = CodeBB->getIterator(); + auto CodeBBEndIter = ++(CodeBB->getIterator()); + if (CodeBBEndIter != MergeIter) { + MF->splice(MergeIter, CodeBBStartIter, CodeBBEndIter); + } + rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut); + prunePHIInfo(CodeBB); + createEntryPHIs(CurrentRegion); + removeExternalCFGSuccessors(CodeBB); + CodeBB->addSuccessor(MergeBB); + CurrentRegion->addMBB(CodeBB); + return nullptr; + } else { + // Handle internal block. + const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn); + unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass); + rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg); + bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB; + MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB, + BBSelectRegIn, IsRegionEntryBB); + CurrentRegion->addMBB(IfBB); + // If this is the entry block we need to make the If block the new + // linearized region entry. + if (IsRegionEntryBB) { + CurrentRegion->setEntry(IfBB); + + if (CurrentRegion->getHasLoop()) { + MachineBasicBlock *RegionExit = CurrentRegion->getExit(); + MachineBasicBlock *ETrueBB = nullptr; + MachineBasicBlock *EFalseBB = nullptr; + SmallVector<MachineOperand, 1> ECond; + + const DebugLoc &DL = DebugLoc(); + TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond); + TII->removeBranch(*RegionExit); + + // We need to create a backedge if there is a loop + unsigned Reg = TII->insertNE( + RegionExit, RegionExit->instr_end(), DL, + CurrentRegion->getRegionMRT()->getInnerOutputRegister(), + CurrentRegion->getRegionMRT()->getEntry()->getNumber()); + MachineOperand RegOp = + MachineOperand::CreateReg(Reg, false, false, true); + ArrayRef<MachineOperand> Cond(RegOp); + DEBUG(dbgs() << "RegionExitReg: "); + DEBUG(Cond[0].print(dbgs(), TRI)); + DEBUG(dbgs() << "\n"); + TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, + Cond, DebugLoc()); + RegionExit->addSuccessor(CurrentRegion->getEntry()); + } + } + CurrentRegion->addMBB(CodeBB); + LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo); + + InnerRegion.setParent(CurrentRegion); + DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); + insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn, + CodeBBSelectReg); + InnerRegion.addMBB(MergeBB); + + DEBUG(InnerRegion.print(dbgs(), TRI)); + rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion); + extractKilledPHIs(CodeBB); + if (IsRegionEntryBB) { + createEntryPHIs(CurrentRegion); + } + return IfBB; + } +} + +MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( + MachineBasicBlock *MergeBB, LinearizedRegion *InnerRegion, + LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB, + unsigned BBSelectRegIn, unsigned BBSelectRegOut) { + unsigned CodeBBSelectReg = + InnerRegion->getRegionMRT()->getInnerOutputRegister(); + MachineBasicBlock *CodeEntryBB = InnerRegion->getEntry(); + MachineBasicBlock *CodeExitBB = InnerRegion->getExit(); + MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeEntryBB, CodeExitBB, + 
SelectBB, BBSelectRegIn, true); + CurrentRegion->addMBB(IfBB); + bool isEntry = CurrentRegion->getEntry() == InnerRegion->getEntry(); + if (isEntry) { + + if (CurrentRegion->getHasLoop()) { + MachineBasicBlock *RegionExit = CurrentRegion->getExit(); + MachineBasicBlock *ETrueBB = nullptr; + MachineBasicBlock *EFalseBB = nullptr; + SmallVector<MachineOperand, 1> ECond; + + const DebugLoc &DL = DebugLoc(); + TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond); + TII->removeBranch(*RegionExit); + + // We need to create a backedge if there is a loop + unsigned Reg = + TII->insertNE(RegionExit, RegionExit->instr_end(), DL, + CurrentRegion->getRegionMRT()->getInnerOutputRegister(), + CurrentRegion->getRegionMRT()->getEntry()->getNumber()); + MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true); + ArrayRef<MachineOperand> Cond(RegOp); + DEBUG(dbgs() << "RegionExitReg: "); + DEBUG(Cond[0].print(dbgs(), TRI)); + DEBUG(dbgs() << "\n"); + TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, + Cond, DebugLoc()); + RegionExit->addSuccessor(IfBB); + } + } + CurrentRegion->addMBBs(InnerRegion); + DEBUG(dbgs() << "Insert BB Select PHI (region)\n"); + insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn, + CodeBBSelectReg); + + rewriteLiveOutRegs(IfBB, /* CodeEntryBB */ CodeExitBB, MergeBB, InnerRegion, + CurrentRegion); + + rewriteRegionEntryPHIs(InnerRegion, IfBB); + + if (isEntry) { + CurrentRegion->setEntry(IfBB); + } + + if (isEntry) { + createEntryPHIs(CurrentRegion); + } + + return IfBB; +} + +void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI, + MachineBasicBlock *Entry, + MachineBasicBlock *EntrySucc, + LinearizedRegion *LRegion) { + SmallVector<unsigned, 2> PHIRegionIndices; + getPHIRegionIndices(LRegion, PHI, PHIRegionIndices); + + assert(PHIRegionIndices.size() == 1); + + unsigned RegionIndex = PHIRegionIndices[0]; + unsigned RegionSourceReg = getPHISourceReg(PHI, RegionIndex); + MachineBasicBlock *RegionSourceMBB = getPHIPred(PHI, RegionIndex); + unsigned PHIDest = getPHIDestReg(PHI); + unsigned PHISource = PHIDest; + unsigned ReplaceReg; + + if (shrinkPHI(PHI, PHIRegionIndices, &ReplaceReg)) { + PHISource = ReplaceReg; + } + + const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest); + unsigned NewDestReg = MRI->createVirtualRegister(RegClass); + LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI); + MachineInstrBuilder MIB = + BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(), + TII->get(TargetOpcode::PHI), NewDestReg); + DEBUG(dbgs() << "Split Entry PHI " << PrintReg(NewDestReg, TRI) + << "<def> = PHI("); + MIB.addReg(PHISource); + MIB.addMBB(Entry); + DEBUG(dbgs() << PrintReg(PHISource, TRI) << ", BB#" << Entry->getNumber()); + MIB.addReg(RegionSourceReg); + MIB.addMBB(RegionSourceMBB); + DEBUG(dbgs() << " ," << PrintReg(RegionSourceReg, TRI) << ", BB#" + << RegionSourceMBB->getNumber() << ")\n"); +} + +void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry, + MachineBasicBlock *EntrySucc, + LinearizedRegion *LRegion) { + SmallVector<MachineInstr *, 2> PHIs; + collectPHIs(Entry, PHIs); + + for (auto PHII : PHIs) { + splitLoopPHI(*PHII, Entry, EntrySucc, LRegion); + } +} + +// Split the exit block so that we can insert a end control flow +MachineBasicBlock * +AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) { + auto MRTRegion = LRegion->getRegionMRT(); + auto Exit = LRegion->getExit(); + auto MF = Exit->getParent(); + auto Succ = 
MRTRegion->getSucc(); + + auto NewExit = MF->CreateMachineBasicBlock(); + auto AfterExitIter = Exit->getIterator(); + AfterExitIter++; + MF->insert(AfterExitIter, NewExit); + Exit->removeSuccessor(Succ); + Exit->addSuccessor(NewExit); + NewExit->addSuccessor(Succ); + insertUnconditionalBranch(NewExit, Succ); + LRegion->addMBB(NewExit); + LRegion->setExit(NewExit); + + DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n"); + + // Replace any PHI Predecessors in the successor with NewExit + for (auto &II : *Succ) { + MachineInstr &Instr = II; + + // If we are past the PHI instructions we are done + if (!Instr.isPHI()) + break; + + int numPreds = getPHINumInputs(Instr); + for (int i = 0; i < numPreds; ++i) { + auto Pred = getPHIPred(Instr, i); + if (Pred == Exit) { + setPhiPred(Instr, i, NewExit); + } + } + } + + return NewExit; +} + + +static MachineBasicBlock *split(MachineBasicBlock::iterator I) { + // Create the fall-through block. + MachineBasicBlock *MBB = (*I).getParent(); + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock(); + auto MBBIter = ++(MBB->getIterator()); + MF->insert(MBBIter, SuccMBB); + SuccMBB->transferSuccessorsAndUpdatePHIs(MBB); + MBB->addSuccessor(SuccMBB); + + // Splice the code over. + SuccMBB->splice(SuccMBB->end(), MBB, I, MBB->end()); + + return SuccMBB; +} + +// Split the entry block separating PHI-nodes and the rest of the code +// This is needed to insert an initializer for the bb select register +// inloop regions. + +MachineBasicBlock * +AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) { + MachineBasicBlock *Entry = LRegion->getEntry(); + MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI()); + MachineBasicBlock *Exit = LRegion->getExit(); + + DEBUG(dbgs() << "Split BB#" << Entry->getNumber() << " to BB#" + << Entry->getNumber() << " -> BB#" << EntrySucc->getNumber() + << "\n"); + LRegion->addMBB(EntrySucc); + + // Make the backedge go to Entry Succ + if (Exit->isSuccessor(Entry)) { + Exit->removeSuccessor(Entry); + } + Exit->addSuccessor(EntrySucc); + MachineInstr &Branch = *(Exit->instr_rbegin()); + for (auto &UI : Branch.uses()) { + if (UI.isMBB() && UI.getMBB() == Entry) { + UI.setMBB(EntrySucc); + } + } + + splitLoopPHIs(Entry, EntrySucc, LRegion); + + return EntrySucc; +} + +LinearizedRegion * +AMDGPUMachineCFGStructurizer::initLinearizedRegion(RegionMRT *Region) { + LinearizedRegion *LRegion = Region->getLinearizedRegion(); + LRegion->initLiveOut(Region, MRI, TRI, PHIInfo); + LRegion->setEntry(Region->getEntry()); + return LRegion; +} + +static void removeOldExitPreds(RegionMRT *Region) { + MachineBasicBlock *Exit = Region->getSucc(); + if (Exit == nullptr) { + return; + } + for (MachineBasicBlock::pred_iterator PI = Exit->pred_begin(), + E = Exit->pred_end(); + PI != E; ++PI) { + if (Region->contains(*PI)) { + (*PI)->removeSuccessor(Exit); + } + } +} + +static bool mbbHasBackEdge(MachineBasicBlock *MBB, + SmallPtrSet<MachineBasicBlock *, 8> &MBBs) { + for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) { + if (MBBs.count(*SI) != 0) { + return true; + } + } + return false; +} + +static bool containsNewBackedge(MRT *Tree, + SmallPtrSet<MachineBasicBlock *, 8> &MBBs) { + // Need to traverse this in reverse since it is in post order. 
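The helper declared here walks the region tree in reverse (the child list is in post order) while accumulating a set of already-visited blocks, and reports a loop as soon as some block's successor is already in that set. A rough standalone model of that test, with plain integers standing in for MachineBasicBlock pointers and a successor table standing in for the MRT (names below are illustrative, not LLVM API):

    #include <set>
    #include <vector>

    // PostOrder lists block ids in post order; Succs maps a block id to its
    // successor ids. Traversing PostOrder in reverse approximates reverse
    // post order, so a successor that is already in Visited can only be
    // reached again through a backedge, i.e. the region contains a loop.
    static bool hasNewBackedge(const std::vector<int> &PostOrder,
                               const std::vector<std::vector<int>> &Succs) {
      std::set<int> Visited;
      for (auto It = PostOrder.rbegin(), E = PostOrder.rend(); It != E; ++It) {
        Visited.insert(*It);
        for (int Succ : Succs[*It])
          if (Visited.count(Succ))
            return true;
      }
      return false;
    }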
+ if (Tree == nullptr) + return false; + + if (Tree->isMBB()) { + MachineBasicBlock *MBB = Tree->getMBBMRT()->getMBB(); + MBBs.insert(MBB); + if (mbbHasBackEdge(MBB, MBBs)) { + return true; + } + } else { + RegionMRT *Region = Tree->getRegionMRT(); + SetVector<MRT *> *Children = Region->getChildren(); + for (auto CI = Children->rbegin(), CE = Children->rend(); CI != CE; ++CI) { + if (containsNewBackedge(*CI, MBBs)) + return true; + } + } + return false; +} + +static bool containsNewBackedge(RegionMRT *Region) { + SmallPtrSet<MachineBasicBlock *, 8> MBBs; + return containsNewBackedge(Region, MBBs); +} + +bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { + auto *LRegion = initLinearizedRegion(Region); + LRegion->setHasLoop(containsNewBackedge(Region)); + MachineBasicBlock *LastMerge = createLinearizedExitBlock(Region); + MachineBasicBlock *CurrentMerge = LastMerge; + LRegion->addMBB(LastMerge); + LRegion->setExit(LastMerge); + + rewriteRegionExitPHIs(Region, LastMerge, LRegion); + removeOldExitPreds(Region); + + DEBUG(PHIInfo.dump(MRI)); + + SetVector<MRT *> *Children = Region->getChildren(); + DEBUG(dbgs() << "===========If Region Start===============\n"); + if (LRegion->getHasLoop()) { + DEBUG(dbgs() << "Has Backedge: Yes\n"); + } else { + DEBUG(dbgs() << "Has Backedge: No\n"); + } + + unsigned BBSelectRegIn; + unsigned BBSelectRegOut; + for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) { + DEBUG(dbgs() << "CurrentRegion: \n"); + DEBUG(LRegion->print(dbgs(), TRI)); + + auto CNI = CI; + ++CNI; + + MRT *Child = (*CI); + + if (Child->isRegion()) { + + LinearizedRegion *InnerLRegion = + Child->getRegionMRT()->getLinearizedRegion(); + // We found the block is the exit of an inner region, we need + // to put it in the current linearized region. + + DEBUG(dbgs() << "Linearizing region: "); + DEBUG(InnerLRegion->print(dbgs(), TRI)); + DEBUG(dbgs() << "\n"); + + MachineBasicBlock *InnerEntry = InnerLRegion->getEntry(); + if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) { + // Entry has already been linearized, no need to do this region. + unsigned OuterSelect = InnerLRegion->getBBSelectRegOut(); + unsigned InnerSelectReg = + InnerLRegion->getRegionMRT()->getInnerOutputRegister(); + replaceRegisterWith(InnerSelectReg, OuterSelect), + resolvePHIInfos(InnerEntry); + if (!InnerLRegion->getExit()->isSuccessor(CurrentMerge)) + InnerLRegion->getExit()->addSuccessor(CurrentMerge); + continue; + } + + BBSelectRegOut = Child->getBBSelectRegOut(); + BBSelectRegIn = Child->getBBSelectRegIn(); + + DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI) + << "\n"); + DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI) + << "\n"); + + MachineBasicBlock *IfEnd = CurrentMerge; + CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion, + Child->getRegionMRT()->getEntry(), + BBSelectRegIn, BBSelectRegOut); + TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); + } else { + MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB(); + DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n"); + + if (MBB == getSingleExitNode(*(MBB->getParent()))) { + // If this is the exit block then we need to skip to the next. + // The "in" register will be transferred to "out" in the next + // iteration. 
+ continue; + } + + BBSelectRegOut = Child->getBBSelectRegOut(); + BBSelectRegIn = Child->getBBSelectRegIn(); + + DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI) + << "\n"); + DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI) + << "\n"); + + MachineBasicBlock *IfEnd = CurrentMerge; + // This is a basic block that is not part of an inner region, we + // need to put it in the current linearized region. + CurrentMerge = createIfRegion(CurrentMerge, MBB, LRegion, BBSelectRegIn, + BBSelectRegOut); + if (CurrentMerge) { + TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); + } + + DEBUG(PHIInfo.dump(MRI)); + } + } + + LRegion->removeFalseRegisterKills(MRI); + + if (LRegion->getHasLoop()) { + MachineBasicBlock *NewSucc = splitEntry(LRegion); + if (isFunctionEntryBlock(LRegion->getEntry())) { + resolvePHIInfos(LRegion->getEntry()); + } + const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI()); + unsigned InReg = LRegion->getBBSelectRegIn(); + unsigned InnerSelectReg = + MRI->createVirtualRegister(MRI->getRegClass(InReg)); + unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); + TII->materializeImmediate(*(LRegion->getEntry()), + LRegion->getEntry()->getFirstTerminator(), DL, + NewInReg, Region->getEntry()->getNumber()); + // Need to be careful about updating the registers inside the region. + LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI); + DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n"); + insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc, + InnerSelectReg, NewInReg, + LRegion->getRegionMRT()->getInnerOutputRegister()); + splitExit(LRegion); + TII->convertNonUniformLoopRegion(NewSucc, LastMerge); + } + + if (Region->isRoot()) { + TII->insertReturn(*LastMerge); + } + + DEBUG(Region->getEntry()->getParent()->dump()); + DEBUG(LRegion->print(dbgs(), TRI)); + DEBUG(PHIInfo.dump(MRI)); + + DEBUG(dbgs() << "===========If Region End===============\n"); + + Region->setLinearizedRegion(LRegion); + return true; +} + +bool AMDGPUMachineCFGStructurizer::structurizeRegion(RegionMRT *Region) { + if (false && regionIsSimpleIf(Region)) { + transformSimpleIfRegion(Region); + return true; + } else if (regionIsSequence(Region)) { + fixupRegionExits(Region); + return false; + } else { + structurizeComplexRegion(Region); + } + return false; +} + +static int structurize_once = 0; + +bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region, + bool isTopRegion) { + bool Changed = false; + + auto Children = Region->getChildren(); + for (auto CI : *Children) { + if (CI->isRegion()) { + Changed |= structurizeRegions(CI->getRegionMRT(), false); + } + } + + if (structurize_once < 2 || true) { + Changed |= structurizeRegion(Region); + structurize_once++; + } + return Changed; +} + +void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) { + DEBUG(dbgs() << "Fallthrough Map:\n"); + for (auto &MBBI : MF) { + MachineBasicBlock *MBB = MBBI.getFallThrough(); + if (MBB != nullptr) { + DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> " + << MBB->getNumber() << "\n"); + } + FallthroughMap[&MBBI] = MBB; + } +} + +void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region, + unsigned SelectOut) { + LinearizedRegion *LRegion = new LinearizedRegion(); + if (SelectOut) { + LRegion->addLiveOut(SelectOut); + DEBUG(dbgs() << "Add LiveOut (BBSelect): " << PrintReg(SelectOut, TRI) + << "\n"); + } + LRegion->setRegionMRT(Region); + Region->setLinearizedRegion(LRegion); + 
LRegion->setParent(Region->getParent() + ? Region->getParent()->getLinearizedRegion() + : nullptr); +} + +unsigned +AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned SelectOut, + MachineRegisterInfo *MRI, + const SIInstrInfo *TII) { + if (MRT->isRegion()) { + RegionMRT *Region = MRT->getRegionMRT(); + Region->setBBSelectRegOut(SelectOut); + unsigned InnerSelectOut = createBBSelectReg(TII, MRI); + + // Fixme: Move linearization creation to the original spot + createLinearizedRegion(Region, SelectOut); + + for (auto CI = Region->getChildren()->begin(), + CE = Region->getChildren()->end(); + CI != CE; ++CI) { + InnerSelectOut = + initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII); + } + MRT->setBBSelectRegIn(InnerSelectOut); + return InnerSelectOut; + } else { + MRT->setBBSelectRegOut(SelectOut); + unsigned NewSelectIn = createBBSelectReg(TII, MRI); + MRT->setBBSelectRegIn(NewSelectIn); + return NewSelectIn; + } +} + +static void checkRegOnlyPHIInputs(MachineFunction &MF) { + for (auto &MBBI : MF) { + for (MachineBasicBlock::instr_iterator I = MBBI.instr_begin(), + E = MBBI.instr_end(); + I != E; ++I) { + MachineInstr &Instr = *I; + if (Instr.isPHI()) { + int numPreds = getPHINumInputs(Instr); + for (int i = 0; i < numPreds; ++i) { + assert(Instr.getOperand(i * 2 + 1).isReg() && + "PHI Operand not a register"); + } + } + } + } +} + + +INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", + "AMDGPU Machine CFG Structurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass) +INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", + "AMDGPU Machine CFG Structurizer", false, false) + +char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID; + + +bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MRI = &(MF.getRegInfo()); + initFallthroughMap(MF); + + checkRegOnlyPHIInputs(MF); + DEBUG(dbgs() << "----STRUCTURIZER START----\n"); + DEBUG(MF.dump()); + + Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo()); + DEBUG(Regions->dump()); + + RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI); + setRegionMRT(RTree); + initializeSelectRegisters(RTree, 0, MRI, TII); + DEBUG(RTree->dump(TRI)); + bool result = structurizeRegions(RTree, true); + delete RTree; + DEBUG(dbgs() << "----STRUCTURIZER END----\n"); + initFallthroughMap(MF); + return result; +} + +FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() { + return new AMDGPUMachineCFGStructurizer(); +} diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 36dcc699d4ea..e40f39557747 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -397,14 +397,17 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { // instructions. static bool canVectorizeInst(Instruction *Inst, User *User) { switch (Inst->getOpcode()) { - case Instruction::Load: + case Instruction::Load: { + LoadInst *LI = cast<LoadInst>(Inst); + return !LI->isVolatile(); + } case Instruction::BitCast: case Instruction::AddrSpaceCast: return true; case Instruction::Store: { // Must be the stored pointer operand, not a stored value. 
StoreInst *SI = cast<StoreInst>(Inst); - return SI->getPointerOperand() == User; + return (SI->getPointerOperand() == User) && !SI->isVolatile(); } default: return false; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 972c28579f7a..6e301b4ad527 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -125,6 +125,9 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWA(false), HasDPP(false), FlatAddressSpace(false), + FlatInstOffsets(false), + FlatGlobalInsts(false), + FlatScratchInsts(false), R600ALUInst(false), CaymanISA(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index a5cda817ac11..bed7d326b3dd 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -145,6 +145,9 @@ protected: bool HasSDWA; bool HasDPP; bool FlatAddressSpace; + bool FlatInstOffsets; + bool FlatGlobalInsts; + bool FlatScratchInsts; bool R600ALUInst; bool CaymanISA; bool CFALUBug; @@ -380,6 +383,18 @@ public: return FlatAddressSpace; } + bool hasFlatInstOffsets() const { + return FlatInstOffsets; + } + + bool hasFlatGlobalInsts() const { + return FlatGlobalInsts; + } + + bool hasFlatScratchInsts() const { + return FlatScratchInsts; + } + bool isMesaKernel(const MachineFunction &MF) const { return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index cd5bad04d0b3..386a88b0520f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -118,6 +118,13 @@ static cl::opt<bool> EnableSIInsertWaitcntsPass( cl::desc("Use new waitcnt insertion pass"), cl::init(false)); +// Option to run late CFG structurizer +static cl::opt<bool> LateCFGStructurize( + "amdgpu-late-structurize", + cl::desc("Enable late CFG structurization"), + cl::init(false), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); @@ -702,11 +709,15 @@ bool GCNPassConfig::addPreISel() { // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. 
addPass(&AMDGPUUnifyDivergentExitNodesID); - addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions + if (!LateCFGStructurize) { + addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions + } addPass(createSinkingPass()); addPass(createSITypeRewriter()); addPass(createAMDGPUAnnotateUniformValues()); - addPass(createSIAnnotateControlFlowPass()); + if (!LateCFGStructurize) { + addPass(createSIAnnotateControlFlowPass()); + } return false; } @@ -770,6 +781,9 @@ bool GCNPassConfig::addGlobalInstructionSelect() { #endif void GCNPassConfig::addPreRegAlloc() { + if (LateCFGStructurize) { + addPass(createAMDGPUMachineCFGStructurizerPass()); + } addPass(createSIWholeQuadModePass()); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index c9482c37ec80..beafebc1284a 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -363,13 +363,22 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index) { switch (Opcode) { case Instruction::ExtractElement: - case Instruction::InsertElement: + case Instruction::InsertElement: { + unsigned EltSize + = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType()); + if (EltSize < 32) { + if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) + return 0; + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } + // Extracts are just reads of a subregister, so are free. Inserts are // considered free because we don't want to have any cost for scalarizing // operations, and we don't have to copy into a different register class. // Dynamic indexing isn't free and is best avoided. return Index == ~0u ? 2 : 0; + } default: return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } @@ -479,3 +488,26 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { return false; } + +unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + if (ST->hasVOP3PInsts()) { + VectorType *VT = cast<VectorType>(Tp); + if (VT->getNumElements() == 2 && + DL.getTypeSizeInBits(VT->getElementType()) == 16) { + // With op_sel VOP3P instructions freely can access the low half or high + // half of a register, so any swizzle is free. 
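The comment above is the fact behind both cost changes in this hunk: a two-element 16-bit vector lives in a single 32-bit VGPR, so element 0 is just the low half of the register, and op_sel lets VOP3P instructions read either half directly. A rough standalone restatement of the resulting cost rules (the enum and fallback values are simplified stand-ins for the TTI interface, not the real API):

    enum class ShuffleKind { Broadcast, Reverse, PermuteSingleSrc, Other };

    // Extracting element 0 of a 16-bit element reads the low half of a
    // 32-bit register and needs no instruction; 32-bit and wider elements
    // are subregister reads (free unless the index is dynamic).
    static unsigned extractCost(unsigned EltBits, unsigned Index,
                                bool Has16BitInsts) {
      if (EltBits < 32)
        return (EltBits == 16 && Index == 0 && Has16BitInsts) ? 0 : 1;
      return Index == ~0u ? 2 : 0;
    }

    // With VOP3P op_sel, any swizzle of a 2 x 16-bit vector is free.
    static unsigned shuffleCost(ShuffleKind Kind, unsigned NumElts,
                                unsigned EltBits, bool HasVOP3P) {
      if (HasVOP3P && NumElts == 2 && EltBits == 16 &&
          Kind != ShuffleKind::Other)
        return 0;
      return 1;
    }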
+ + switch (Kind) { + case TTI::SK_Broadcast: + case TTI::SK_Reverse: + case TTI::SK_PermuteSingleSrc: + return 0; + default: + break; + } + } + } + + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 71d6306bc1a5..e0024e21e82b 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -114,6 +114,9 @@ public: } unsigned getVectorSplitCost() { return 0; } + + unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp); }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 7c0ef4aeac3c..cafce0164fa9 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -48,6 +48,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUISelDAGToDAG.cpp AMDGPULowerIntrinsics.cpp AMDGPUMCInstLower.cpp + AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp AMDGPUUnifyMetadata.cpp AMDGPUOpenCLImageTypeLoweringPass.cpp diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index b0ac0e689a0b..8ba9efd42c70 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">; +def FLATAtomic : ComplexPattern<i64, 2, "SelectFlat">; //===----------------------------------------------------------------------===// // FLAT classes @@ -62,7 +62,9 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : bits<8> vdst; bits<1> slc; bits<1> glc; - bits<1> tfe; + + // We don't use tfe right now, and it was removed in gfx9. + bits<1> tfe = 0; // 15-0 is reserved. 
let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); @@ -79,8 +81,8 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo< opName, (outs regClass:$vdst), - (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe), - " $vdst, $vaddr$glc$slc$tfe"> { + (ins VReg_64:$vaddr, GLC:$glc, slc:$slc), + " $vdst, $vaddr$glc$slc"> { let has_data = 0; let mayLoad = 1; } @@ -88,8 +90,8 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo< class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo< opName, (outs), - (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe), - " $vaddr, $vdata$glc$slc$tfe"> { + (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc), + " $vaddr, $vdata$glc$slc"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -105,8 +107,8 @@ multiclass FLAT_Atomic_Pseudo< def "" : FLAT_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe), - " $vaddr, $vdata$slc$tfe", + (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc), + " $vaddr, $vdata$slc", []>, AtomicNoRet <NAME, 0> { let mayLoad = 1; @@ -119,10 +121,10 @@ multiclass FLAT_Atomic_Pseudo< def _RTN : FLAT_Pseudo <opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe), - " $vdst, $vaddr, $vdata glc$slc$tfe", + (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc), + " $vdst, $vaddr, $vdata glc$slc", [(set vt:$vdst, - (atomic (FLATAtomic i64:$vaddr, i1:$slc, i1:$tfe), data_vt:$vdata))]>, + (atomic (FLATAtomic i64:$vaddr, i1:$slc), data_vt:$vdata))]>, AtomicNoRet <NAME, 1> { let mayLoad = 1; let mayStore = 1; @@ -311,30 +313,30 @@ def flat_truncstorei16 : flat_st <truncstorei16>; // Patterns for global loads with no offset. class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < (vt (node i64:$addr)), - (inst $addr, 0, 0, 0) + (inst $addr, 0, 0) >; class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < (vt (node i64:$addr)), - (inst $addr, 1, 0, 0) + (inst $addr, 1, 0) >; class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < (node vt:$data, i64:$addr), - (inst $addr, $data, 0, 0, 0) + (inst $addr, $data, 0, 0) >; class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < // atomic store follows atomic binop convention so the address comes // first. 
(node i64:$addr, vt:$data), - (inst $addr, $data, 1, 0, 0) + (inst $addr, $data, 1, 0) >; class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : Pat < (vt (node i64:$addr, data_vt:$data)), - (inst $addr, $data, 0, 0) + (inst $addr, $data, 0) >; let Predicates = [isCIVI] in { diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index bf16a8216001..8066428fe44a 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -27,7 +27,7 @@ void llvm::printLivesAt(SlotIndex SI, unsigned Num = 0; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { const unsigned Reg = TargetRegisterInfo::index2VirtReg(I); - if (MRI.reg_nodbg_empty(Reg)) + if (!LIS.hasInterval(Reg)) continue; const auto &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { @@ -131,13 +131,13 @@ bool GCNRegPressure::less(const SISubtarget &ST, const GCNRegPressure& O, unsigned MaxOccupancy) const { const auto SGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumSGPRs(getSGRPNum())); + ST.getOccupancyWithNumSGPRs(getSGPRNum())); const auto VGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(getVGRPNum())); + ST.getOccupancyWithNumVGPRs(getVGPRNum())); const auto OtherSGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumSGPRs(O.getSGRPNum())); + ST.getOccupancyWithNumSGPRs(O.getSGPRNum())); const auto OtherVGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(O.getVGRPNum())); + ST.getOccupancyWithNumVGPRs(O.getVGPRNum())); const auto Occ = std::min(SGPROcc, VGPROcc); const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); @@ -167,17 +167,17 @@ bool GCNRegPressure::less(const SISubtarget &ST, return VW < OtherVW; } } - return SGPRImportant ? (getSGRPNum() < O.getSGRPNum()): - (getVGRPNum() < O.getVGRPNum()); + return SGPRImportant ? 
(getSGPRNum() < O.getSGPRNum()): + (getVGPRNum() < O.getVGPRNum()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const { - OS << "VGPRs: " << getVGRPNum(); - if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')'; - OS << ", SGPRs: " << getSGRPNum(); - if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')'; + OS << "VGPRs: " << getVGPRNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')'; + OS << ", SGPRs: " << getSGPRNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')'; OS << ", LVGPR WT: " << getVGPRTuplesWeight() << ", LSGPR WT: " << getSGPRTuplesWeight(); if (ST) OS << " -> Occ: " << getOccupancy(*ST); @@ -192,7 +192,6 @@ LaneBitmask llvm::getLiveLaneMask(unsigned Reg, SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI) { - assert(!MRI.reg_nodbg_empty(Reg)); LaneBitmask LiveMask; const auto &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { @@ -214,7 +213,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, GCNRPTracker::LiveRegSet LiveRegs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { auto Reg = TargetRegisterInfo::index2VirtReg(I); - if (MRI.reg_nodbg_empty(Reg)) + if (!LIS.hasInterval(Reg)) continue; auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); if (LiveMask.any()) @@ -223,13 +222,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, return LiveRegs; } -void GCNUpwardRPTracker::reset(const MachineInstr &MI) { - MRI = &MI.getParent()->getParent()->getRegInfo(); - LiveRegs = getLiveRegsAfter(MI, LIS); - MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); -} - -LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const { +LaneBitmask GCNRPTracker::getDefRegMask(const MachineOperand &MO) const { assert(MO.isDef() && MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())); @@ -241,7 +234,7 @@ LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const { MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg()); } -LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const { +LaneBitmask GCNRPTracker::getUsedRegMask(const MachineOperand &MO) const { assert(MO.isUse() && MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())); @@ -259,6 +252,18 @@ LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const { return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI); } +void GCNUpwardRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy) { + MRI = &MI.getParent()->getParent()->getRegInfo(); + if (LiveRegsCopy) { + if (&LiveRegs != LiveRegsCopy) + LiveRegs = *LiveRegsCopy; + } else { + LiveRegs = getLiveRegsAfter(MI, LIS); + } + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); +} + void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(MRI && "call reset first"); @@ -297,6 +302,100 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { MaxPressure = max(MaxPressure, CurPressure); } +bool GCNDownwardRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy) { + MRI = &MI.getParent()->getParent()->getRegInfo(); + LastTrackedMI = nullptr; + MBBEnd = MI.getParent()->end(); + NextMI = &MI; + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + if (NextMI == MBBEnd) + return false; + if (LiveRegsCopy) { + if (&LiveRegs != LiveRegsCopy) + LiveRegs = *LiveRegsCopy; + } else { + LiveRegs = 
getLiveRegsBefore(*NextMI, LIS); + } + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); + return true; +} + +bool GCNDownwardRPTracker::advanceBeforeNext() { + assert(MRI && "call reset first"); + + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + if (NextMI == MBBEnd) + return false; + + SlotIndex SI = LIS.getInstructionIndex(*NextMI).getBaseIndex(); + assert(SI.isValid()); + + // Remove dead registers or mask bits. + for (auto &It : LiveRegs) { + const LiveInterval &LI = LIS.getInterval(It.first); + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) { + if (!S.liveAt(SI)) { + auto PrevMask = It.second; + It.second &= ~S.LaneMask; + CurPressure.inc(It.first, PrevMask, It.second, *MRI); + } + } + } else if (!LI.liveAt(SI)) { + auto PrevMask = It.second; + It.second = LaneBitmask::getNone(); + CurPressure.inc(It.first, PrevMask, It.second, *MRI); + } + if (It.second.none()) + LiveRegs.erase(It.first); + } + + MaxPressure = max(MaxPressure, CurPressure); + + return true; +} + +void GCNDownwardRPTracker::advanceToNext() { + LastTrackedMI = &*NextMI++; + + // Add new registers or mask bits. + for (const auto &MO : LastTrackedMI->defs()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + auto &LiveMask = LiveRegs[Reg]; + auto PrevMask = LiveMask; + LiveMask |= getDefRegMask(MO); + CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); + } + + MaxPressure = max(MaxPressure, CurPressure); +} + +bool GCNDownwardRPTracker::advance() { + // If we have just called reset live set is actual. + if ((NextMI == MBBEnd) || (LastTrackedMI && !advanceBeforeNext())) + return false; + advanceToNext(); + return true; +} + +bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator End) { + while (NextMI != End) + if (!advance()) return false; + return true; +} + +bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin, + MachineBasicBlock::const_iterator End, + const LiveRegSet *LiveRegsCopy) { + reset(*Begin, LiveRegsCopy); + return advance(End); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, @@ -352,4 +451,16 @@ bool GCNUpwardRPTracker::isValid() const { return true; } +void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, + const MachineRegisterInfo &MRI) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + auto It = LiveRegs.find(Reg); + if (It != LiveRegs.end() && It->second.any()) + OS << ' ' << PrintVRegOrUnit(Reg, TRI) << ':' + << PrintLaneMask(It->second); + } + OS << '\n'; +} #endif diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h index 82e76a7bfddc..9875ca6a6d16 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.h +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -33,19 +33,19 @@ struct GCNRegPressure { clear(); } - bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; } + bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; } void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } - unsigned getSGRPNum() const { return Value[SGPR32]; } - unsigned getVGRPNum() const { return Value[VGPR32]; } + unsigned getSGPRNum() const { return Value[SGPR32]; } + unsigned getVGPRNum() const { return Value[VGPR32]; } unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; } 
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; } unsigned getOccupancy(const SISubtarget &ST) const { - return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()), - ST.getOccupancyWithNumVGPRs(getVGRPNum())); + return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), + ST.getOccupancyWithNumVGPRs(getVGPRNum())); } void inc(unsigned Reg, @@ -92,16 +92,21 @@ public: typedef DenseMap<unsigned, LaneBitmask> LiveRegSet; protected: + const LiveIntervals &LIS; LiveRegSet LiveRegs; GCNRegPressure CurPressure, MaxPressure; const MachineInstr *LastTrackedMI = nullptr; mutable const MachineRegisterInfo *MRI = nullptr; - GCNRPTracker() {} + GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} + LaneBitmask getDefRegMask(const MachineOperand &MO) const; + LaneBitmask getUsedRegMask(const MachineOperand &MO) const; public: // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; } + void clearMaxPressure() { MaxPressure.clear(); } + // returns MaxPressure, resetting it decltype(MaxPressure) moveMaxPressure() { auto Res = MaxPressure; @@ -111,17 +116,16 @@ public: decltype(LiveRegs) moveLiveRegs() { return std::move(LiveRegs); } + static void printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, + const MachineRegisterInfo &MRI); }; class GCNUpwardRPTracker : public GCNRPTracker { - const LiveIntervals &LIS; - LaneBitmask getDefRegMask(const MachineOperand &MO) const; - LaneBitmask getUsedRegMask(const MachineOperand &MO) const; public: - GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} + GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} // reset tracker to the point just below MI // filling live regs upon this point using LIS - void reset(const MachineInstr &MI); + void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); // move to the state just above the MI void recede(const MachineInstr &MI); @@ -131,6 +135,41 @@ public: bool isValid() const; }; +class GCNDownwardRPTracker : public GCNRPTracker { + // Last position of reset or advanceBeforeNext + MachineBasicBlock::const_iterator NextMI; + + MachineBasicBlock::const_iterator MBBEnd; + +public: + GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} + + const MachineBasicBlock::const_iterator getNext() const { return NextMI; } + + // Reset tracker to the point before the MI + // filling live regs upon this point using LIS. + // Returns false if block is empty except debug values. + bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); + + // Move to the state right before the next MI. Returns false if reached + // end of the block. + bool advanceBeforeNext(); + + // Move to the state at the MI, advanceBeforeNext has to be called first. + void advanceToNext(); + + // Move to the state at the next MI. Returns false if reached end of block. + bool advance(); + + // Advance instructions until before End. + bool advance(MachineBasicBlock::const_iterator End); + + // Reset to Begin and advance to End. 
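The downward tracker declared here is driven in a simple forward pattern: reset at the first instruction of interest, repeatedly advance, and read the accumulated maximum. A rough standalone model of that downward walk, with simplified stand-ins for instructions and registers (not the LLVM types):

    #include <algorithm>
    #include <set>
    #include <vector>

    struct ToyInst {
      std::vector<int> Dead; // registers no longer live when this runs
      std::vector<int> Defs; // registers this instruction defines
    };

    // Forward ("downward") pressure tracking over one scheduling region:
    // before each instruction drop values that have died, after it add its
    // definitions, and keep the running maximum of the live-set size.
    static size_t maxDownwardPressure(const std::vector<ToyInst> &Region,
                                      std::set<int> LiveIn) {
      std::set<int> Live = std::move(LiveIn);
      size_t Max = Live.size();
      for (const ToyInst &I : Region) {
        for (int R : I.Dead)
          Live.erase(R);                    // advanceBeforeNext()
        Max = std::max(Max, Live.size());
        for (int R : I.Defs)
          Live.insert(R);                   // advanceToNext()
        Max = std::max(Max, Live.size());
      }
      return Max;
    }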
+ bool advance(MachineBasicBlock::const_iterator Begin, + MachineBasicBlock::const_iterator End, + const LiveRegSet *LiveRegsCopy = nullptr); +}; + LaneBitmask getLiveLaneMask(unsigned Reg, SlotIndex SI, const LiveIntervals &LIS, diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 630442625aa3..8ec46665daf5 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -316,46 +316,57 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, MFI(*MF.getInfo<SIMachineFunctionInfo>()), StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(), *MF.getFunction())), - MinOccupancy(StartingOccupancy), Stage(0) { + MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) { DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); } void GCNScheduleDAGMILive::schedule() { + if (Stage == 0) { + // Just record regions at the first pass. + Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); + return; + } + std::vector<MachineInstr*> Unsched; Unsched.reserve(NumRegionInstrs); for (auto &I : *this) Unsched.push_back(&I); - std::pair<unsigned, unsigned> PressureBefore; + GCNRegPressure PressureBefore; if (LIS) { - DEBUG(dbgs() << "Pressure before scheduling:\n"); - discoverLiveIns(); - PressureBefore = getRealRegPressure(); + PressureBefore = Pressure[RegionIdx]; + + DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:"; + GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI); + dbgs() << "Region live-in pressure: "; + llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs()); + dbgs() << "Region register pressure: "; + PressureBefore.print(dbgs())); } ScheduleDAGMILive::schedule(); - if (Stage == 0) - Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); + Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); if (!LIS) return; // Check the results of scheduling. 
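The result check that follows boils down to an occupancy comparison: estimate how many waves the register counts allow before and after scheduling, and keep the new schedule only if occupancy did not drop. A minimal standalone model of that decision (the wave budgets below are illustrative placeholders, not the real SISubtarget tables):

    #include <algorithm>

    // Illustrative budgets only; real limits come from the subtarget.
    static unsigned wavesForSGPRs(unsigned SGPRs) {
      return SGPRs <= 80 ? 10 : 800 / SGPRs;
    }
    static unsigned wavesForVGPRs(unsigned VGPRs) {
      return VGPRs <= 24 ? 10 : 256 / VGPRs;
    }

    static unsigned maxWaves(unsigned SGPRs, unsigned VGPRs) {
      return std::min(wavesForSGPRs(SGPRs), wavesForVGPRs(VGPRs));
    }

    // Accept the new schedule only if it does not lose occupancy; otherwise
    // the scheduler reverts to the original instruction order.
    static bool keepSchedule(unsigned SGPRsBefore, unsigned VGPRsBefore,
                             unsigned SGPRsAfter, unsigned VGPRsAfter) {
      return maxWaves(SGPRsAfter, VGPRsAfter) >=
             maxWaves(SGPRsBefore, VGPRsBefore);
    }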
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; - DEBUG(dbgs() << "Pressure after scheduling:\n"); auto PressureAfter = getRealRegPressure(); - LiveIns.clear(); - if (PressureAfter.first <= S.SGPRCriticalLimit && - PressureAfter.second <= S.VGPRCriticalLimit) { + DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs())); + + if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && + PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) { + Pressure[RegionIdx] = PressureAfter; DEBUG(dbgs() << "Pressure in desired limits, done.\n"); return; } - unsigned WavesAfter = getMaxWaves(PressureAfter.first, - PressureAfter.second, MF); - unsigned WavesBefore = getMaxWaves(PressureBefore.first, - PressureBefore.second, MF); + unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(), + PressureAfter.getVGPRNum(), MF); + unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(), + PressureBefore.getVGPRNum(), MF); DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << ", after " << WavesAfter << ".\n"); @@ -368,8 +379,10 @@ void GCNScheduleDAGMILive::schedule() { << MinOccupancy << ".\n"); } - if (WavesAfter >= WavesBefore) + if (WavesAfter >= WavesBefore) { + Pressure[RegionIdx] = PressureAfter; return; + } DEBUG(dbgs() << "Attempting to revert scheduling.\n"); RegionEnd = RegionBegin; @@ -398,166 +411,139 @@ void GCNScheduleDAGMILive::schedule() { DEBUG(dbgs() << "Scheduling " << *MI); } RegionBegin = Unsched.front()->getIterator(); - if (Stage == 0) - Regions.back() = std::make_pair(RegionBegin, RegionEnd); + Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); placeDebugValues(); } -static inline void setMask(const MachineRegisterInfo &MRI, - const SIRegisterInfo *SRI, unsigned Reg, - LaneBitmask &PrevMask, LaneBitmask NewMask, - unsigned &SGPRs, unsigned &VGPRs) { - int NewRegs = countPopulation(NewMask.getAsInteger()) - - countPopulation(PrevMask.getAsInteger()); - if (SRI->isSGPRReg(MRI, Reg)) - SGPRs += NewRegs; - if (SRI->isVGPR(MRI, Reg)) - VGPRs += NewRegs; - assert ((int)SGPRs >= 0 && (int)VGPRs >= 0); - PrevMask = NewMask; +GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const { + GCNDownwardRPTracker RPTracker(*LIS); + RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); + return RPTracker.moveMaxPressure(); } -void GCNScheduleDAGMILive::discoverLiveIns() { - unsigned SGPRs = 0; - unsigned VGPRs = 0; +void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { + GCNDownwardRPTracker RPTracker(*LIS); + + // If the block has the only successor then live-ins of that successor are + // live-outs of the current block. We can reuse calculated live set if the + // successor will be sent to scheduling past current block. 
+ const MachineBasicBlock *OnlySucc = nullptr; + if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) { + SlotIndexes *Ind = LIS->getSlotIndexes(); + if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin())) + OnlySucc = *MBB->succ_begin(); + } - auto &MI = *begin()->getParent()->getFirstNonDebugInstr(); - const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); - SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex(); - assert (SI.isValid()); - - DEBUG(dbgs() << "Region live-ins:"); - for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); - if (MRI.reg_nodbg_empty(Reg)) - continue; - const LiveInterval &LI = LIS->getInterval(Reg); - LaneBitmask LaneMask = LaneBitmask::getNone(); - if (LI.hasSubRanges()) { - for (const auto &S : LI.subranges()) - if (S.liveAt(SI)) - LaneMask |= S.LaneMask; - } else if (LI.liveAt(SI)) { - LaneMask = MRI.getMaxLaneMaskForVReg(Reg); - } + // Scheduler sends regions from the end of the block upwards. + size_t CurRegion = RegionIdx; + for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) + if (Regions[CurRegion].first->getParent() != MBB) + break; + --CurRegion; + + auto I = MBB->begin(); + auto LiveInIt = MBBLiveIns.find(MBB); + if (LiveInIt != MBBLiveIns.end()) { + auto LiveIn = std::move(LiveInIt->second); + RPTracker.reset(*MBB->begin(), &LiveIn); + MBBLiveIns.erase(LiveInIt); + } else { + I = Regions[CurRegion].first; + RPTracker.reset(*I); + } - if (LaneMask.any()) { - setMask(MRI, SRI, Reg, LiveIns[Reg], LaneMask, SGPRs, VGPRs); + for ( ; ; ) { + I = RPTracker.getNext(); - DEBUG(dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':' - << PrintLaneMask(LiveIns[Reg])); + if (Regions[CurRegion].first == I) { + LiveIns[CurRegion] = RPTracker.getLiveRegs(); + RPTracker.clearMaxPressure(); } - } - LiveInPressure = std::make_pair(SGPRs, VGPRs); + if (Regions[CurRegion].second == I) { + Pressure[CurRegion] = RPTracker.moveMaxPressure(); + if (CurRegion-- == RegionIdx) + break; + } + RPTracker.advanceToNext(); + RPTracker.advanceBeforeNext(); + } - DEBUG(dbgs() << "\nLive-in pressure:\nSGPR = " << SGPRs - << "\nVGPR = " << VGPRs << '\n'); + if (OnlySucc) { + if (I != MBB->end()) { + RPTracker.advanceToNext(); + RPTracker.advance(MBB->end()); + } + RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs()); + RPTracker.advanceBeforeNext(); + MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); + } } -std::pair<unsigned, unsigned> -GCNScheduleDAGMILive::getRealRegPressure() const { - unsigned SGPRs, MaxSGPRs, VGPRs, MaxVGPRs; - SGPRs = MaxSGPRs = LiveInPressure.first; - VGPRs = MaxVGPRs = LiveInPressure.second; - - const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); - DenseMap<unsigned, LaneBitmask> LiveRegs(LiveIns); +void GCNScheduleDAGMILive::finalizeSchedule() { + GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; + DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); - for (const MachineInstr &MI : *this) { - if (MI.isDebugValue()) - continue; - SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex(); - assert (SI.isValid()); + LiveIns.resize(Regions.size()); + Pressure.resize(Regions.size()); - // Remove dead registers or mask bits. 
- for (auto &It : LiveRegs) { - if (It.second.none()) - continue; - const LiveInterval &LI = LIS->getInterval(It.first); - if (LI.hasSubRanges()) { - for (const auto &S : LI.subranges()) - if (!S.liveAt(SI)) - setMask(MRI, SRI, It.first, It.second, It.second & ~S.LaneMask, - SGPRs, VGPRs); - } else if (!LI.liveAt(SI)) { - setMask(MRI, SRI, It.first, It.second, LaneBitmask::getNone(), - SGPRs, VGPRs); - } - } + do { + Stage++; + RegionIdx = 0; + MachineBasicBlock *MBB = nullptr; - // Add new registers or mask bits. - for (const auto &MO : MI.defs()) { - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - unsigned SubRegIdx = MO.getSubReg(); - LaneBitmask LaneMask = SubRegIdx != 0 - ? TRI->getSubRegIndexLaneMask(SubRegIdx) - : MRI.getMaxLaneMaskForVReg(Reg); - LaneBitmask &LM = LiveRegs[Reg]; - setMask(MRI, SRI, Reg, LM, LM | LaneMask, SGPRs, VGPRs); - } - MaxSGPRs = std::max(MaxSGPRs, SGPRs); - MaxVGPRs = std::max(MaxVGPRs, VGPRs); - } + if (Stage > 1) { + // Retry function scheduling if we found resulting occupancy and it is + // lower than used for first pass scheduling. This will give more freedom + // to schedule low register pressure blocks. + // Code is partially copied from MachineSchedulerBase::scheduleRegions(). - DEBUG(dbgs() << "Real region's register pressure:\nSGPR = " << MaxSGPRs - << "\nVGPR = " << MaxVGPRs << '\n'); + if (!LIS || StartingOccupancy <= MinOccupancy) + break; - return std::make_pair(MaxSGPRs, MaxVGPRs); -} + DEBUG(dbgs() + << "Retrying function scheduling with lowest recorded occupancy " + << MinOccupancy << ".\n"); -void GCNScheduleDAGMILive::finalizeSchedule() { - // Retry function scheduling if we found resulting occupancy and it is - // lower than used for first pass scheduling. This will give more freedom - // to schedule low register pressure blocks. - // Code is partially copied from MachineSchedulerBase::scheduleRegions(). + S.setTargetOccupancy(MinOccupancy); + } - if (!LIS || StartingOccupancy <= MinOccupancy) - return; + for (auto Region : Regions) { + RegionBegin = Region.first; + RegionEnd = Region.second; - DEBUG(dbgs() << "Retrying function scheduling with lowest recorded occupancy " - << MinOccupancy << ".\n"); + if (RegionBegin->getParent() != MBB) { + if (MBB) finishBlock(); + MBB = RegionBegin->getParent(); + startBlock(MBB); + if (Stage == 1) + computeBlockPressure(MBB); + } - Stage++; - GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; - S.setTargetOccupancy(MinOccupancy); + unsigned NumRegionInstrs = std::distance(begin(), end()); + enterRegion(MBB, begin(), end(), NumRegionInstrs); - MachineBasicBlock *MBB = nullptr; - for (auto Region : Regions) { - RegionBegin = Region.first; - RegionEnd = Region.second; + // Skip empty scheduling regions (0 or 1 schedulable instructions). 
+ if (begin() == end() || begin() == std::prev(end())) { + exitRegion(); + continue; + } - if (RegionBegin->getParent() != MBB) { - if (MBB) finishBlock(); - MBB = RegionBegin->getParent(); - startBlock(MBB); - } + DEBUG(dbgs() << "********** MI Scheduling **********\n"); + DEBUG(dbgs() << MF.getName() + << ":BB#" << MBB->getNumber() << " " << MBB->getName() + << "\n From: " << *begin() << " To: "; + if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; + else dbgs() << "End"; + dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); - unsigned NumRegionInstrs = std::distance(begin(), end()); - enterRegion(MBB, begin(), end(), NumRegionInstrs); + schedule(); - // Skip empty scheduling regions (0 or 1 schedulable instructions). - if (begin() == end() || begin() == std::prev(end())) { exitRegion(); - continue; + ++RegionIdx; } - DEBUG(dbgs() << "********** MI Scheduling **********\n"); - DEBUG(dbgs() << MF.getName() - << ":BB#" << MBB->getNumber() << " " << MBB->getName() - << "\n From: " << *begin() << " To: "; - if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; - else dbgs() << "End"; - dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); + finishBlock(); - schedule(); - - exitRegion(); - } - finishBlock(); - LiveIns.shrink_and_clear(); + } while (Stage < 2); } diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h index 15af232704ff..3ed3cd5b3b1c 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H +#include "GCNRegPressure.h" #include "llvm/CodeGen/MachineScheduler.h" namespace llvm { @@ -74,21 +75,28 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive { // Scheduling stage number. unsigned Stage; + // Current region index. + size_t RegionIdx; + // Vecor of regions recorder for later rescheduling SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32> Regions; - // Region live-ins. - DenseMap<unsigned, LaneBitmask> LiveIns; + // Region live-in cache. + SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns; + + // Region pressure cache. + SmallVector<GCNRegPressure, 32> Pressure; + + // Temporary basic block live-in cache. + DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns; - // Number of live-ins to the current region, first SGPR then VGPR. - std::pair<unsigned, unsigned> LiveInPressure; + // Return current region pressure. + GCNRegPressure getRealRegPressure() const; - // Collect current region live-ins. - void discoverLiveIns(); + // Compute and cache live-ins and pressure for all regions in block. + void computeBlockPressure(const MachineBasicBlock *MBB); - // Return current region pressure. First value is SGPR number, second is VGPR. 
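Taken together, the rewritten finalizeSchedule above turns scheduling into a staged process: stage 0 (the initial schedule() calls) only records region boundaries, stage 1 walks the recorded regions using cached block pressure, and stage 2 re-schedules everything with the lowest recorded occupancy as the target, but only if stage 1 had to settle for less than the starting occupancy. A rough standalone sketch of that driver shape (placeholder types and hooks, not the real ScheduleDAG interface):

    #include <vector>

    struct ToyRegion { int Begin, End; }; // stand-in for recorded iterators

    // Models only the stages driven by finalizeSchedule (1 and 2); the
    // stage-0 recording of Regions is assumed to have happened already.
    template <typename ScheduleFn>
    static void twoStageSchedule(const std::vector<ToyRegion> &Regions,
                                 unsigned StartingOccupancy,
                                 unsigned &MinOccupancy,
                                 ScheduleFn ScheduleRegion) {
      for (unsigned Stage = 1; Stage <= 2; ++Stage) {
        if (Stage == 2 && StartingOccupancy <= MinOccupancy)
          break;                 // First pass kept occupancy; no retry needed.
        // On stage 2 the callee would use MinOccupancy as the relaxed target.
        for (const ToyRegion &R : Regions)
          ScheduleRegion(R, Stage, MinOccupancy); // Stage 1 may lower it.
      }
    }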
- std::pair<unsigned, unsigned> getRealRegPressure() const; public: GCNScheduleDAGMILive(MachineSchedContext *C, diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index d8cb98fe1b19..8cb35c506135 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -126,7 +126,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Void = Type::getVoidTy(Context); Boolean = Type::getInt1Ty(Context); Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); + ReturnStruct = StructType::get(Boolean, Int64); BoolTrue = ConstantInt::getTrue(Context); BoolFalse = ConstantInt::getFalse(Context); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index cc93c27731ff..48a14e4dbea2 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -488,6 +488,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FCANONICALIZE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -2003,6 +2004,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( break; } assert(Found); + (void)Found; // This should be before all vector instructions. BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) @@ -4604,6 +4606,24 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performExtractVectorEltCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + SDValue Vec = N->getOperand(0); + + SelectionDAG &DAG= DCI.DAG; + if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { + SDLoc SL(N); + EVT EltVT = N->getValueType(0); + SDValue Idx = N->getOperand(1); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(0), Idx); + return DAG.getNode(ISD::FNEG, SL, EltVT, Elt); + } + + return SDValue(); +} + + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -4891,6 +4911,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } + case ISD::EXTRACT_VECTOR_ELT: + return performExtractVectorEltCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index d177777ad5ee..046e677756d1 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -100,6 +100,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 92e452a3d6a0..065fd09eb356 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -496,6 +496,188 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const { return Opcode; } +void SIInstrInfo::materializeImmediate(MachineBasicBlock 
&MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + int64_t Value) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); + if (RegClass == &AMDGPU::SReg_32RegClass || + RegClass == &AMDGPU::SGPR_32RegClass || + RegClass == &AMDGPU::SReg_32_XM0RegClass || + RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) + .addImm(Value); + return; + } + + if (RegClass == &AMDGPU::SReg_64RegClass || + RegClass == &AMDGPU::SGPR_64RegClass || + RegClass == &AMDGPU::SReg_64_XEXECRegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) + .addImm(Value); + return; + } + + if (RegClass == &AMDGPU::VGPR_32RegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addImm(Value); + return; + } + if (RegClass == &AMDGPU::VReg_64RegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) + .addImm(Value); + return; + } + + unsigned EltSize = 4; + unsigned Opcode = AMDGPU::V_MOV_B32_e32; + if (RI.isSGPRClass(RegClass)) { + if (RI.getRegSizeInBits(*RegClass) > 32) { + Opcode = AMDGPU::S_MOV_B64; + EltSize = 8; + } else { + Opcode = AMDGPU::S_MOV_B32; + EltSize = 4; + } + } + + ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); + for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { + int64_t IdxValue = Idx == 0 ? Value : 0; + + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, + get(Opcode), RI.getSubReg(DestReg, Idx)); + Builder.addImm(IdxValue); + } +} + +const TargetRegisterClass * +SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { + return &AMDGPU::VGPR_32RegClass; +} + +void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, + unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && + "Not a VGPR32 reg"); + + if (Cond.size() == 1) { + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .add(Cond[0]); + } else if (Cond.size() == 2) { + assert(Cond[0].isImm() && "Cond[0] is not an immediate"); + switch (Cond[0].getImm()) { + case SIInstrInfo::SCC_TRUE: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(-1) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::SCC_FALSE: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(0) + .addImm(-1); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::VCCNZ: { + MachineOperand RegOp = Cond[1]; + RegOp.setImplicit(false); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .add(RegOp); + break; + } + case SIInstrInfo::VCCZ: { + MachineOperand RegOp = Cond[1]; + RegOp.setImplicit(false); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(TrueReg) + .addReg(FalseReg) + .add(RegOp); + break; + } + case SIInstrInfo::EXECNZ: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg2 = 
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(-1) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::EXECZ: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(0) + .addImm(-1); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + llvm_unreachable("Unhandled branch predicate EXECZ"); + break; + } + default: + llvm_unreachable("invalid branch predicate"); + } + } else { + llvm_unreachable("Can only handle Cond size 1 or 2"); + } +} + +unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned SrcReg, int Value) const { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) + .addImm(Value) + .addReg(SrcReg); + + return Reg; +} + +unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned SrcReg, int Value) const { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) + .addImm(Value) + .addReg(SrcReg); + + return Reg; +} + unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (RI.getRegSizeInBits(*DstRC) == 32) { @@ -834,6 +1016,20 @@ void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, insertWaitStates(MBB, MI, 1); } +void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { + auto MF = MBB.getParent(); + SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + + assert(Info->isEntryFunction()); + + if (MBB.succ_empty()) { + bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); + if (HasNoTerminator) + BuildMI(MBB, MBB.end(), DebugLoc(), + get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG)); + } +} + unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return 1; // FIXME: Do wait states equal cycles? @@ -1241,14 +1437,20 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, return false; } - BranchPredicate Pred = getBranchPredicate(I->getOpcode()); - if (Pred == INVALID_BR) - return true; + MachineBasicBlock *CondBB = nullptr; - MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); - Cond.push_back(MachineOperand::CreateImm(Pred)); - Cond.push_back(I->getOperand(1)); // Save the branch register. + if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + CondBB = I->getOperand(1).getMBB(); + Cond.push_back(I->getOperand(0)); + } else { + BranchPredicate Pred = getBranchPredicate(I->getOpcode()); + if (Pred == INVALID_BR) + return true; + CondBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(Pred)); + Cond.push_back(I->getOperand(1)); // Save the branch register. 
+ } ++I; if (I == MBB.end()) { @@ -1351,6 +1553,13 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, return 1; } + if(Cond.size() == 1 && Cond[0].isReg()) { + BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) + .add(Cond[0]) + .addMBB(TBB); + return 1; + } + assert(TBB && Cond[0].isImm()); unsigned Opcode @@ -1390,9 +1599,16 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, bool SIInstrInfo::reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const { - assert(Cond.size() == 2); - Cond[0].setImm(-Cond[0].getImm()); - return false; + if (Cond.size() != 2) { + return true; + } + + if (Cond[0].isImm()) { + Cond[0].setImm(-Cond[0].getImm()); + return false; + } + + return true; } bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, @@ -3920,6 +4136,82 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { return false; } +bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { + return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; +} + +void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, + MachineBasicBlock *IfEnd) const { + MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); + assert(TI != IfEntry->end()); + + MachineInstr *Branch = &(*TI); + MachineFunction *MF = IfEntry->getParent(); + MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); + + if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstr *SIIF = + BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) + .add(Branch->getOperand(0)) + .add(Branch->getOperand(1)); + MachineInstr *SIEND = + BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) + .addReg(DstReg); + + IfEntry->erase(TI); + IfEntry->insert(IfEntry->end(), SIIF); + IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); + } +} + +void SIInstrInfo::convertNonUniformLoopRegion( + MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { + MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); + // We expect 2 terminators, one conditional and one unconditional. 
+ assert(TI != LoopEnd->end()); + + MachineInstr *Branch = &(*TI); + MachineFunction *MF = LoopEnd->getParent(); + MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); + + if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstrBuilder HeaderPHIBuilder = + BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); + for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), + E = LoopEntry->pred_end(); + PI != E; ++PI) { + if (*PI == LoopEnd) { + HeaderPHIBuilder.addReg(BackEdgeReg); + } else { + MachineBasicBlock *PMBB = *PI; + unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), + ZeroReg, 0); + HeaderPHIBuilder.addReg(ZeroReg); + } + HeaderPHIBuilder.addMBB(*PI); + } + MachineInstr *HeaderPhi = HeaderPHIBuilder; + MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), + get(AMDGPU::SI_IF_BREAK), BackEdgeReg) + .addReg(DstReg) + .add(Branch->getOperand(0)); + MachineInstr *SILOOP = + BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) + .addReg(BackEdgeReg) + .addMBB(LoopEntry); + + LoopEntry->insert(LoopEntry->begin(), HeaderPhi); + LoopEnd->erase(TI); + LoopEnd->insert(LoopEnd->end(), SIIFBREAK); + LoopEnd->insert(LoopEnd->end(), SILOOP); + } +} + ArrayRef<std::pair<int, const char *>> SIInstrInfo::getSerializableTargetIndices() const { static const std::pair<int, const char *> TargetIndices[] = { diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 03a5ef74b179..f6e5e8883f63 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -143,6 +143,23 @@ public: RegScavenger *RS, unsigned TmpReg, unsigned Offset, unsigned Size) const; + void materializeImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, + unsigned DestReg, + int64_t Value) const; + + const TargetRegisterClass *getPreferredSelectRegClass( + unsigned Size) const; + + unsigned insertNE(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned SrcReg, int Value) const; + + unsigned insertEQ(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned SrcReg, int Value) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -193,7 +210,7 @@ public: bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const override; + bool AllowModify = false) const override; unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved = nullptr) const override; @@ -218,6 +235,11 @@ public: unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const override; + void insertVectorSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const; + bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; @@ -705,6 +727,7 @@ public: void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + void insertReturn(MachineBasicBlock &MBB) const; /// \brief Return the number of 
wait states that result from executing this /// instruction. unsigned getNumWaitStates(const MachineInstr &MI) const; @@ -750,6 +773,14 @@ public: bool mayAccessFlatAddressSpace(const MachineInstr &MI) const; + bool isNonUniformBranchInstr(MachineInstr &Instr) const; + + void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, + MachineBasicBlock *IfEnd) const; + + void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, + MachineBasicBlock *LoopEnd) const; + ArrayRef<std::pair<int, const char *>> getSerializableTargetIndices() const override; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 7ccb54f54e34..3b4bdc864253 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -174,6 +174,13 @@ def SI_MASK_BRANCH : VPseudoInstSI < let isTerminator = 1 in { + def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < + (outs), + (ins SReg_64:$vcc, brtarget:$target), + [(brcond i1:$vcc, bb:$target)]> { + let Size = 12; +} + def SI_IF: CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 2281f338ab45..4a11d9471f1d 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -164,8 +164,11 @@ multiclass VOP2eInst <string opName, class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); - field string Asm32 = "$vdst, $src0, $src1, $imm"; field bit HasExt = 0; + + // Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; + field string Asm32 = " $vdst, $src0, $src1, $imm"; } def VOP_MADAK_F16 : VOP_MADAK <f16>; @@ -174,8 +177,11 @@ def VOP_MADAK_F32 : VOP_MADAK <f32>; class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1); - field string Asm32 = "$vdst, $src0, $imm, $src1"; field bit HasExt = 0; + + // Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; + field string Asm32 = " $vdst, $src0, $imm, $src1"; } def VOP_MADMK_F16 : VOP_MADMK <f16>; @@ -298,7 +304,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { let SubtargetPredicate = isGCN in { defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; -def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>; +def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>; @@ -328,7 +334,7 @@ let Constraints = "$vdst = $src2", DisableEncoding="$src2", defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; } -def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>; +def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">; // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. 
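Stepping back to the SIInstrInfo additions above: SI_NON_UNIFORM_BRCOND_PSEUDO is only the selection-time half, and a later control-flow pass is expected to query the new hooks and rewrite the pseudo into the structured SI_IF / SI_END_CF (or SI_IF_BREAK / SI_LOOP) form. A minimal usage sketch, assuming the caller has already identified the entry and exit blocks of the if-region; the function and variable names here are illustrative and not part of this patch:

    // Hypothetical caller of the new SIInstrInfo hooks (TII is the function's SIInstrInfo).
    static void lowerNonUniformIf(const SIInstrInfo *TII,
                                  MachineBasicBlock *IfEntry,
                                  MachineBasicBlock *IfEnd) {
      MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
      // The conversion only applies when the terminator is the pseudo added above.
      if (TI != IfEntry->end() && TII->isNonUniformBranchInstr(*TI))
        // Replaces the pseudo branch with SI_IF in IfEntry and inserts
        // SI_END_CF at the start of IfEnd, as implemented earlier in this diff.
        TII->convertNonUniformIfRegion(IfEntry, IfEnd);
    }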
@@ -383,7 +389,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; let SubtargetPredicate = isVI in { -def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>; +def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; @@ -394,7 +400,7 @@ defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; -def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>; +def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; @@ -651,6 +657,17 @@ multiclass VOP2_Real_e64_vi <bits<10> op> { VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } +multiclass VOP2_Real_e64only_vi <bits<10> op> { + def _e64_vi : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + // Hack to stop printing _e64 + VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64"); + let OutOperandList = (outs VGPR_32:$vdst); + let AsmString = ps.Mnemonic # " " # ps.AsmOperands; + } +} + multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> { def _e64_vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, @@ -718,17 +735,17 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>; defm V_READLANE_B32 : VOP32_Real_vi <0x289>; defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>; -defm V_BFM_B32 : VOP2_Real_e64_vi <0x293>; -defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>; -defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>; -defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>; -defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>; -defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>; -defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>; -defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>; -defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>; -defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>; -defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>; +defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>; +defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>; +defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>; +defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64only_vi <0x28d>; +defm V_LDEXP_F32 : VOP2_Real_e64only_vi <0x288>; +defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64only_vi <0x1f0>; +defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64only_vi <0x294>; +defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64only_vi <0x295>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64only_vi <0x296>; +defm V_CVT_PK_U16_U32 : VOP2_Real_e64only_vi <0x297>; +defm V_CVT_PK_I16_I32 : VOP2_Real_e64only_vi <0x298>; defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>; defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 217a07488853..ffa6c60d6b1f 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -232,7 +232,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>; let SubtargetPredicate = isCIVI in { -def V_MQSAD_U16_U8 : VOP3Inst 
<"v_mqsad_u16_u8", VOP3_Profile<VOP_I32_I32_I32>>; def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>; def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>; @@ -402,7 +401,6 @@ multiclass VOP3be_Real_ci<bits<9> op> { } } -defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>; defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>; defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>; defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>; @@ -426,7 +424,6 @@ multiclass VOP3be_Real_vi<bits<10> op> { } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" -defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>; defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 28c407f74125..dd7fe871345a 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -404,21 +404,11 @@ public: /// Returns predicate register associated with the given frame instruction. unsigned getFramePred(const MachineInstr &MI) const { assert(isFrameInstr(MI)); - if (isFrameSetup(MI)) - // Operands of ADJCALLSTACKDOWN: - // - argument declared in ADJCALLSTACKDOWN pattern: - // 0 - frame size - // 1 - predicate code (like ARMCC::AL) - // - added by predOps: - // 2 - predicate reg - return MI.getOperand(2).getReg(); - assert(MI.getOpcode() == ARM::ADJCALLSTACKUP || - MI.getOpcode() == ARM::tADJCALLSTACKUP); - // Operands of ADJCALLSTACKUP: - // - argument declared in ADJCALLSTACKUP pattern: + // Operands of ADJCALLSTACKDOWN/ADJCALLSTACKUP: + // - argument declared in the pattern: // 0 - frame size - // 1 - arg of CALLSEQ_END - // 2 - predicate code + // 1 - arg of CALLSEQ_START/CALLSEQ_END + // 2 - predicate code (like ARMCC::AL) // - added by predOps: // 3 - predicate reg return MI.getOperand(3).getReg(); diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 9178c67afa6e..46ac4d0ad933 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -433,7 +433,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // We now know the size of the stack - update the ADJCALLSTACKDOWN // accordingly. - CallSeqStart.addImm(ArgHandler.StackSize).add(predOps(ARMCC::AL)); + CallSeqStart.addImm(ArgHandler.StackSize).addImm(0).add(predOps(ARMCC::AL)); MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP) .addImm(ArgHandler.StackSize) diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 56cac855620d..4f6a73b5980d 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -1949,7 +1949,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes)); + .addImm(NumBytes).addImm(0)); // Process the args. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index e64582402fe1..f8b584db7b99 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -473,9 +473,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } // Use divmod compiler-rt calls for iOS 5.0 and later. 
- if (Subtarget->isTargetWatchOS() || - (Subtarget->isTargetIOS() && - !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { + if (Subtarget->isTargetMachO() && + !(Subtarget->isTargetIOS() && + Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } @@ -1817,8 +1817,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!isSibCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(NumBytes, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); @@ -7365,7 +7364,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; @@ -13115,7 +13114,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); + Type *RetTy = StructType::get(Ty, Ty); if (Subtarget->isTargetWindows()) InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); @@ -13417,9 +13416,9 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, } // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -13428,7 +13427,7 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, case AtomicOrdering::Acquire: return nullptr; // Nothing to do case AtomicOrdering::SequentiallyConsistent: - if (!IsStore) + if (!Inst->hasAtomicStore()) return nullptr; // Nothing to do /*FALLTHROUGH*/ case AtomicOrdering::Release: @@ -13442,9 +13441,9 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, llvm_unreachable("Unknown fence ordering in emitLeadingFence"); } -Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 08c51b66dfe7..875c06210ae6 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -483,10 +483,10 @@ class InstrItineraryData; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; - Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; - Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; + Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const 
override; + Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index a94d6048f02d..d06b7d0896f1 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -16,7 +16,8 @@ // // Type profiles. -def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; def SDT_ARMStructByVal : SDTypeProfile<0, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, @@ -1968,8 +1969,8 @@ PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary, [(ARMcallseq_end timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKDOWN : -PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, - [(ARMcallseq_start timm:$amt)]>; +PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary, + [(ARMcallseq_start timm:$amt, timm:$amt2)]>; } def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary, diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 8048c758e998..bee83dfb6f63 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -284,8 +284,8 @@ def tADJCALLSTACKUP : Requires<[IsThumb, IsThumb1Only]>; def tADJCALLSTACKDOWN : - PseudoInst<(outs), (ins i32imm:$amt), NoItinerary, - [(ARMcallseq_start imm:$amt)]>, + PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2), NoItinerary, + [(ARMcallseq_start imm:$amt, imm:$amt2)]>, Requires<[IsThumb, IsThumb1Only]>; } diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 2ac3fda9f448..8c680cdf9b47 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -101,14 +101,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, assert(RegBank && "Can't get reg bank for virtual register"); const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - (void)DstSize; - unsigned SrcReg = I.getOperand(1).getReg(); - const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); - (void)SrcSize; - // We use copies for trunc, so it's ok for the size of the destination to be - // smaller (the higher bits will just be undefined). 
- assert(DstSize <= SrcSize && "Copy with different width?!"); - assert((RegBank->getID() == ARM::GPRRegBankID || RegBank->getID() == ARM::FPRRegBankID) && "Unsupported reg bank"); @@ -135,28 +127,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return true; } -static bool selectFAdd(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, - MachineRegisterInfo &MRI) { - assert(TII.getSubtarget().hasVFP2() && "Can't select fp add without vfp"); - - LLT Ty = MRI.getType(MIB->getOperand(0).getReg()); - unsigned ValSize = Ty.getSizeInBits(); - - if (ValSize == 32) { - if (TII.getSubtarget().useNEONForSinglePrecisionFP()) - return false; - MIB->setDesc(TII.get(ARM::VADDS)); - } else { - assert(ValSize == 64 && "Unsupported size for floating point value"); - if (TII.getSubtarget().isFPOnlySP()) - return false; - MIB->setDesc(TII.get(ARM::VADDD)); - } - MIB.add(predOps(ARMCC::AL)); - - return true; -} - static bool selectSequence(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, MachineRegisterInfo &MRI, @@ -352,6 +322,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } break; } + case G_ANYEXT: case G_TRUNC: { // The high bits are undefined, so there's nothing special to do, just // treat it as a copy. @@ -362,12 +333,12 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); if (SrcRegBank.getID() != DstRegBank.getID()) { - DEBUG(dbgs() << "G_TRUNC operands on different register banks\n"); + DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n"); return false; } if (SrcRegBank.getID() != ARM::GPRRegBankID) { - DEBUG(dbgs() << "G_TRUNC on non-GPR not supported yet\n"); + DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n"); return false; } @@ -393,10 +364,6 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; - case G_FADD: - if (!selectFAdd(MIB, TII, MRI)) - return false; - break; case G_FRAME_INDEX: // Add 0 to the given frame index and hope it will eventually be folded into // the user(s). 
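With selectFAdd removed, the notable piece of this file is that G_ANYEXT now funnels through the same path as G_TRUNC: the extra high bits are undefined, so both reduce to a plain register copy once the operands are known to live in the GPR bank. A condensed sketch of that shared handling, assuming the selectCopy helper above; the surrounding switch in ARMInstructionSelector::select is abbreviated and the exact check order is simplified:

    case G_ANYEXT:
    case G_TRUNC: {
      // Nothing to compute: confirm both operands sit in the GPR bank,
      // then lower the instruction to a COPY.
      const unsigned DstReg = I.getOperand(0).getReg();
      const unsigned SrcReg = I.getOperand(1).getReg();
      if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != ARM::GPRRegBankID ||
          RBI.getRegBank(SrcReg, MRI, TRI)->getID() != ARM::GPRRegBankID)
        return false;
      I.setDesc(TII.get(TargetOpcode::COPY));
      return selectCopy(I, TII, MRI, TRI, RBI);
    }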
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 9b86030fdd29..5bf6c7aed6b8 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -45,9 +45,11 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, 1, p0}, Legal); } - for (unsigned Op : {G_ADD, G_SUB, G_MUL}) - for (auto Ty : {s1, s8, s16, s32}) - setAction({Op, Ty}, Legal); + for (unsigned Op : {G_ADD, G_SUB, G_MUL}) { + for (auto Ty : {s1, s8, s16}) + setAction({Op, Ty}, WidenScalar); + setAction({Op, s32}, Legal); + } for (unsigned Op : {G_SDIV, G_UDIV}) { for (auto Ty : {s8, s16}) diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp index 581d5fe159fd..7e4d598a6e0b 100644 --- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp +++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp @@ -88,13 +88,15 @@ bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) { } } } + bool Changed = false; // Remove the tagged DMB for (auto MI : ToRemove) { MI->eraseFromParent(); ++NumDMBsRemoved; + Changed = true; } - return NumDMBsRemoved > 0; + return Changed; } /// createARMOptimizeBarriersPass - Returns an instance of the remove double diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index 13a32211f88c..a20997c95cd9 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -225,6 +225,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_UDIV: case G_SEXT: case G_ZEXT: + case G_ANYEXT: case G_TRUNC: case G_GEP: // FIXME: We're abusing the fact that everything lives in a GPR for now; in diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index d09f3ecbaa28..5583d6148b08 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -13,7 +13,9 @@ #include "ARM.h" #include "ARMCallLowering.h" #include "ARMLegalizerInfo.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL #include "ARMRegisterBankInfo.h" +#endif #include "ARMSubtarget.h" #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp index c297865db820..0ec8e8b08ceb 100644 --- a/lib/Target/AVR/AVRFrameLowering.cpp +++ b/lib/Target/AVR/AVRFrameLowering.cpp @@ -375,7 +375,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr( DebugLoc DL = MI->getDebugLoc(); unsigned int Opcode = MI->getOpcode(); - int Amount = MI->getOperand(0).getImm(); + int Amount = TII.getFrameSize(*MI); // Adjcallstackup does not need to allocate stack space for the call, instead // we insert push instructions that will allocate the necessary stack. diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index f0ab6acedad1..ef9c00e4b784 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -361,7 +361,7 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - Type *RetTy = (Type *)StructType::get(Ty, Ty, nullptr); + Type *RetTy = (Type *)StructType::get(Ty, Ty); SDLoc dl(Op); TargetLowering::CallLoweringInfo CLI(DAG); @@ -1166,8 +1166,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. 
unsigned NumBytes = CCInfo.getNextStackOffset(); - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; @@ -1611,8 +1610,9 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator I = MBB->getParent()->begin(); - ++I; + MachineFunction::iterator I; + for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I); + if (I != MF->end()) ++I; MF->insert(I, trueMBB); MF->insert(I, falseMBB); diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td index 1b6547ef7795..06ad2b3ffdf8 100644 --- a/lib/Target/AVR/AVRInstrInfo.td +++ b/lib/Target/AVR/AVRInstrInfo.td @@ -17,7 +17,7 @@ include "AVRInstrFormats.td" // AVR Type Profiles //===----------------------------------------------------------------------===// -def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>; +def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; @@ -333,9 +333,9 @@ let Defs = [SP, SREG], Uses = [SP] in { def ADJCALLSTACKDOWN : Pseudo<(outs), - (ins i16imm:$amt), + (ins i16imm:$amt, i16imm:$amt2), "#ADJCALLSTACKDOWN", - [(AVRcallseq_start timm:$amt)]>; + [(AVRcallseq_start timm:$amt, timm:$amt2)]>; // R31R30 is used to update SP, since it is a scratch reg and this instruction // is placed after the function call then R31R30 should be always free. diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index 2813e24d2ac7..11a47bad78ba 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -52,7 +52,6 @@ AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF, BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF.getTarget()); - const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering(); // Reserve the intermediate result registers r1 and r2 // The result of instructions like 'mul' is always stored here. diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index b9b3dff95c0a..6897161c903c 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -257,8 +257,7 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } auto PtrVT = getPointerTy(MF.getDataLayout()); - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); SmallVector<std::pair<unsigned, SDValue>, MaxArgs> RegsToPass; diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td index 93ee24371c4d..c6c0ff587c6b 100644 --- a/lib/Target/BPF/BPFInstrInfo.td +++ b/lib/Target/BPF/BPFInstrInfo.td @@ -16,7 +16,8 @@ include "BPFInstrFormats.td" // Instruction Operands and Patterns // These are target-independent nodes, but have target-specific formats. 
-def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>; +def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>, + SDTCisVT<1, iPTR>]>; def SDT_BPFCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; def SDT_BPFCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; def SDT_BPFSetFlag : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>]>; @@ -445,9 +446,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1, // ADJCALLSTACKDOWN/UP pseudo insns let Defs = [R11], Uses = [R11] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt), - "#ADJCALLSTACKDOWN $amt", - [(BPFcallseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), + "#ADJCALLSTACKDOWN $amt1 $amt2", + [(BPFcallseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", [(BPFcallseq_end timm:$amt1, timm:$amt2)]>; diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 861af94f1e38..1dffebe97f2d 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -848,8 +848,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue Glue; if (!IsTailCall) { - SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true); - Chain = DAG.getCALLSEQ_START(Chain, C, dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); Glue = Chain.getValue(1); } diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 5a5799dbe009..e4df7ff5c200 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1209,7 +1209,7 @@ bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V, KnownBits Known(T->getBitWidth()); computeKnownBits(V, Known, DL); - return Known.Zero.countLeadingOnes() >= IterCount; + return Known.countMinLeadingZeros() >= IterCount; } diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index 32503d111c24..81b5e10c1173 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -714,7 +714,8 @@ def: Pat<(i1 0), (PS_false)>; def: Pat<(i1 1), (PS_true)>; // Pseudo instructions. 
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; @@ -732,8 +733,8 @@ def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def: Pat<(callseq_start timm:$amt), - (ADJCALLSTACKDOWN imm:$amt)>; +def: Pat<(callseq_start timm:$amt, timm:$amt2), + (ADJCALLSTACKDOWN imm:$amt, imm:$amt2)>; def: Pat<(callseq_end timm:$amt1, timm:$amt2), (ADJCALLSTACKUP imm:$amt1, imm:$amt2)>; diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td index 8c2caea2d5c5..0f99dfe342b8 100644 --- a/lib/Target/Hexagon/HexagonPseudo.td +++ b/lib/Target/Hexagon/HexagonPseudo.td @@ -80,7 +80,7 @@ def PS_false : InstHexagon<(outs PredRegs:$dst), (ins), "", [(set I1:$dst, 0)], "", C2_andn.Itinerary, TypeCR>; let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), ".error \"should not emit\" ", []>; let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp index d156294a0b0c..0a9cac2565f2 100644 --- a/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/lib/Target/Lanai/LanaiISelLowering.cpp @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "LanaiISelLowering.h" #include "Lanai.h" #include "LanaiCondCode.h" -#include "LanaiISelLowering.h" #include "LanaiMachineFunctionInfo.h" #include "LanaiSubtarget.h" #include "LanaiTargetObjectFile.h" @@ -38,10 +38,11 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetCallingConv.h" @@ -649,10 +650,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo( ByValArgs.push_back(FIPtr); } - Chain = DAG.getCALLSEQ_START( - Chain, - DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass; SmallVector<SDValue, 12> MemOpChains; @@ -1502,3 +1500,24 @@ SDValue LanaiTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } + +void LanaiTargetLowering::computeKnownBitsForTargetNode( + const SDValue Op, KnownBits &Known, const APInt &DemandedElts, + const SelectionDAG &DAG, unsigned Depth) const { + unsigned BitWidth = Known.getBitWidth(); + switch (Op.getOpcode()) { + default: + break; + case LanaiISD::SETCC: + Known = KnownBits(BitWidth); + Known.Zero.setBits(1, BitWidth); + break; + case LanaiISD::SELECT_CC: + KnownBits Known2; + DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1); + DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1); + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; + break; + } +} diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h index c2fba4f9d167..49ad52a39771 100644 --- a/lib/Target/Lanai/LanaiISelLowering.h +++ b/lib/Target/Lanai/LanaiISelLowering.h @@ -106,6 +106,11 @@ public: SDValue 
PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + private: SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool IsVarArg, diff --git a/lib/Target/Lanai/LanaiInstrInfo.td b/lib/Target/Lanai/LanaiInstrInfo.td index 285fca11737d..776fee101dfe 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.td +++ b/lib/Target/Lanai/LanaiInstrInfo.td @@ -22,7 +22,8 @@ include "LanaiInstrFormats.td" // -------------------------------------------------- // // These are target-independent nodes, but have target-specific formats. -def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; def SDT_LanaiCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_LanaiCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; @@ -750,9 +751,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1, // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber SP. let Defs = [SP], Uses = [SP] in { - def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - "#ADJCALLSTACKDOWN $amt", - [(CallSeqStart timm:$amt)]>; + def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKDOWN $amt1 $amt2", + [(CallSeqStart timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", [(CallSeqEnd timm:$amt1, timm:$amt2)]>; @@ -770,9 +771,6 @@ let Uses = [SR] in { [(set (i32 GPR:$Rs1), (LanaiSetCC imm:$DDDI))]>; } -// SCC's output is already 1-bit so and'ing with 1 is redundant. -def : Pat<(and (LanaiSetCC imm:$DDDI), 1), (SCC imm:$DDDI)>; - // Select with hardware support let Uses = [SR], isSelect = 1 in { def SELECT : InstRR<0b111, (outs GPR:$Rd), diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp index f1cb0b6c031b..b4ff8f66c55f 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -236,7 +236,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( // adjcallstackdown instruction into 'add SP, <amt>' // TODO: consider using push / pop instead of sub + store / add MachineInstr &Old = *I; - uint64_t Amount = Old.getOperand(0).getImm(); + uint64_t Amount = TII.getFrameSize(Old); if (Amount != 0) { // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next @@ -252,8 +252,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( } else { assert(Old.getOpcode() == TII.getCallFrameDestroyOpcode()); // factor out the amount the callee already popped. - uint64_t CalleeAmt = Old.getOperand(1).getImm(); - Amount -= CalleeAmt; + Amount -= TII.getFramePoppedByCallee(Old); if (Amount) New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::ADD16ri), MSP430::SP) @@ -272,7 +271,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( } else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. 
- if (uint64_t CalleeAmt = I->getOperand(1).getImm()) { + if (uint64_t CalleeAmt = TII.getFramePoppedByCallee(*I)) { MachineInstr &Old = *I; MachineInstr *New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP) diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 40b1dd3cc2eb..cc6e64043f54 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -40,21 +40,24 @@ using namespace llvm; typedef enum { NoHWMult, - HWMultIntr, - HWMultNoIntr + HWMult16, + HWMult32, + HWMultF5 } HWMultUseMode; static cl::opt<HWMultUseMode> -HWMultMode("msp430-hwmult-mode", cl::Hidden, +HWMultMode("mhwmult", cl::Hidden, cl::desc("Hardware multiplier use mode"), - cl::init(HWMultNoIntr), + cl::init(NoHWMult), cl::values( - clEnumValN(NoHWMult, "no", + clEnumValN(NoHWMult, "none", "Do not use hardware multiplier"), - clEnumValN(HWMultIntr, "interrupts", - "Assume hardware multiplier can be used inside interrupts"), - clEnumValN(HWMultNoIntr, "use", - "Assume hardware multiplier cannot be used inside interrupts"))); + clEnumValN(HWMult16, "16bit", + "Use 16-bit hardware multiplier"), + clEnumValN(HWMult32, "32bit", + "Use 32-bit hardware multiplier"), + clEnumValN(HWMultF5, "f5series", + "Use F5 series hardware multiplier"))); MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, const MSP430Subtarget &STI) @@ -131,29 +134,29 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); // FIXME: Implement efficiently multiplication by a constant - setOperationAction(ISD::MUL, MVT::i8, Expand); - setOperationAction(ISD::MULHS, MVT::i8, Expand); - setOperationAction(ISD::MULHU, MVT::i8, Expand); - setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand); - setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand); - setOperationAction(ISD::MUL, MVT::i16, Expand); + setOperationAction(ISD::MUL, MVT::i8, Promote); + setOperationAction(ISD::MULHS, MVT::i8, Promote); + setOperationAction(ISD::MULHU, MVT::i8, Promote); + setOperationAction(ISD::SMUL_LOHI, MVT::i8, Promote); + setOperationAction(ISD::UMUL_LOHI, MVT::i8, Promote); + setOperationAction(ISD::MUL, MVT::i16, LibCall); setOperationAction(ISD::MULHS, MVT::i16, Expand); setOperationAction(ISD::MULHU, MVT::i16, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand); - setOperationAction(ISD::UDIV, MVT::i8, Expand); - setOperationAction(ISD::UDIVREM, MVT::i8, Expand); - setOperationAction(ISD::UREM, MVT::i8, Expand); - setOperationAction(ISD::SDIV, MVT::i8, Expand); - setOperationAction(ISD::SDIVREM, MVT::i8, Expand); - setOperationAction(ISD::SREM, MVT::i8, Expand); - setOperationAction(ISD::UDIV, MVT::i16, Expand); + setOperationAction(ISD::UDIV, MVT::i8, Promote); + setOperationAction(ISD::UDIVREM, MVT::i8, Promote); + setOperationAction(ISD::UREM, MVT::i8, Promote); + setOperationAction(ISD::SDIV, MVT::i8, Promote); + setOperationAction(ISD::SDIVREM, MVT::i8, Promote); + setOperationAction(ISD::SREM, MVT::i8, Promote); + setOperationAction(ISD::UDIV, MVT::i16, LibCall); setOperationAction(ISD::UDIVREM, MVT::i16, Expand); - setOperationAction(ISD::UREM, MVT::i16, Expand); - setOperationAction(ISD::SDIV, MVT::i16, Expand); + setOperationAction(ISD::UREM, MVT::i16, LibCall); + setOperationAction(ISD::SDIV, MVT::i16, LibCall); setOperationAction(ISD::SDIVREM, MVT::i16, Expand); - setOperationAction(ISD::SREM, MVT::i16, 
Expand); + setOperationAction(ISD::SREM, MVT::i16, LibCall); // varargs support setOperationAction(ISD::VASTART, MVT::Other, Custom); @@ -162,15 +165,183 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::JumpTable, MVT::i16, Custom); - // Libcalls names. - if (HWMultMode == HWMultIntr) { - setLibcallName(RTLIB::MUL_I8, "__mulqi3hw"); - setLibcallName(RTLIB::MUL_I16, "__mulhi3hw"); - } else if (HWMultMode == HWMultNoIntr) { - setLibcallName(RTLIB::MUL_I8, "__mulqi3hw_noint"); - setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint"); + // EABI Libcalls - EABI Section 6.2 + const struct { + const RTLIB::Libcall Op; + const char * const Name; + const ISD::CondCode Cond; + } LibraryCalls[] = { + // Floating point conversions - EABI Table 6 + { RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf", ISD::SETCC_INVALID }, + { RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc but is not in the EABI + { RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld", ISD::SETCC_INVALID }, + // The following IS implemented in libgcc but is not in the EABI + { RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc but is not in the EABI + { RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf", ISD::SETCC_INVALID }, + // The following IS implemented in libgcc but is not in the EABI + { RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf", ISD::SETCC_INVALID }, + + // Floating point comparisons - EABI Table 7 + { RTLIB::OEQ_F64, "__mspabi_cmpd", ISD::SETEQ }, + { RTLIB::UNE_F64, "__mspabi_cmpd", ISD::SETNE }, + { RTLIB::OGE_F64, "__mspabi_cmpd", ISD::SETGE }, + { 
RTLIB::OLT_F64, "__mspabi_cmpd", ISD::SETLT }, + { RTLIB::OLE_F64, "__mspabi_cmpd", ISD::SETLE }, + { RTLIB::OGT_F64, "__mspabi_cmpd", ISD::SETGT }, + { RTLIB::OEQ_F32, "__mspabi_cmpf", ISD::SETEQ }, + { RTLIB::UNE_F32, "__mspabi_cmpf", ISD::SETNE }, + { RTLIB::OGE_F32, "__mspabi_cmpf", ISD::SETGE }, + { RTLIB::OLT_F32, "__mspabi_cmpf", ISD::SETLT }, + { RTLIB::OLE_F32, "__mspabi_cmpf", ISD::SETLE }, + { RTLIB::OGT_F32, "__mspabi_cmpf", ISD::SETGT }, + + // Floating point arithmetic - EABI Table 8 + { RTLIB::ADD_F64, "__mspabi_addd", ISD::SETCC_INVALID }, + { RTLIB::ADD_F32, "__mspabi_addf", ISD::SETCC_INVALID }, + { RTLIB::DIV_F64, "__mspabi_divd", ISD::SETCC_INVALID }, + { RTLIB::DIV_F32, "__mspabi_divf", ISD::SETCC_INVALID }, + { RTLIB::MUL_F64, "__mspabi_mpyd", ISD::SETCC_INVALID }, + { RTLIB::MUL_F32, "__mspabi_mpyf", ISD::SETCC_INVALID }, + { RTLIB::SUB_F64, "__mspabi_subd", ISD::SETCC_INVALID }, + { RTLIB::SUB_F32, "__mspabi_subf", ISD::SETCC_INVALID }, + // The following are NOT implemented in libgcc + // { RTLIB::NEG_F64, "__mspabi_negd", ISD::SETCC_INVALID }, + // { RTLIB::NEG_F32, "__mspabi_negf", ISD::SETCC_INVALID }, + + // TODO: SLL/SRA/SRL are in libgcc, RLL isn't + + // Universal Integer Operations - EABI Table 9 + { RTLIB::SDIV_I16, "__mspabi_divi", ISD::SETCC_INVALID }, + { RTLIB::SDIV_I32, "__mspabi_divli", ISD::SETCC_INVALID }, + { RTLIB::SDIV_I64, "__mspabi_divlli", ISD::SETCC_INVALID }, + { RTLIB::UDIV_I16, "__mspabi_divu", ISD::SETCC_INVALID }, + { RTLIB::UDIV_I32, "__mspabi_divul", ISD::SETCC_INVALID }, + { RTLIB::UDIV_I64, "__mspabi_divull", ISD::SETCC_INVALID }, + { RTLIB::SREM_I16, "__mspabi_remi", ISD::SETCC_INVALID }, + { RTLIB::SREM_I32, "__mspabi_remli", ISD::SETCC_INVALID }, + { RTLIB::SREM_I64, "__mspabi_remlli", ISD::SETCC_INVALID }, + { RTLIB::UREM_I16, "__mspabi_remu", ISD::SETCC_INVALID }, + { RTLIB::UREM_I32, "__mspabi_remul", ISD::SETCC_INVALID }, + { RTLIB::UREM_I64, "__mspabi_remull", ISD::SETCC_INVALID }, + + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + + if (HWMultMode == HWMult16) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + } LibraryCalls[] = { + // Integer Multiply - EABI Table 9 + { RTLIB::MUL_I16, "__mspabi_mpyi_hw" }, + { RTLIB::MUL_I32, "__mspabi_mpyl_hw" }, + { RTLIB::MUL_I64, "__mspabi_mpyll_hw" }, + // TODO The __mspabi_mpysl*_hw functions ARE implemented in libgcc + // TODO The __mspabi_mpyul*_hw functions ARE implemented in libgcc + }; + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + } + } else if (HWMultMode == HWMult32) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + } LibraryCalls[] = { + // Integer Multiply - EABI Table 9 + { RTLIB::MUL_I16, "__mspabi_mpyi_hw" }, + { RTLIB::MUL_I32, "__mspabi_mpyl_hw32" }, + { RTLIB::MUL_I64, "__mspabi_mpyll_hw32" }, + // TODO The __mspabi_mpysl*_hw32 functions ARE implemented in libgcc + // TODO The __mspabi_mpyul*_hw32 functions ARE implemented in libgcc + }; + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + } + } else if (HWMultMode == HWMultF5) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + } LibraryCalls[] = { + // Integer Multiply - EABI Table 9 + { RTLIB::MUL_I16, "__mspabi_mpyi_f5hw" }, + { RTLIB::MUL_I32, "__mspabi_mpyl_f5hw" }, + { RTLIB::MUL_I64, "__mspabi_mpyll_f5hw" }, + // TODO The __mspabi_mpysl*_f5hw functions ARE implemented in libgcc + 
// TODO The __mspabi_mpyul*_f5hw functions ARE implemented in libgcc + }; + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + } + } else { // NoHWMult + const struct { + const RTLIB::Libcall Op; + const char * const Name; + } LibraryCalls[] = { + // Integer Multiply - EABI Table 9 + { RTLIB::MUL_I16, "__mspabi_mpyi" }, + { RTLIB::MUL_I32, "__mspabi_mpyl" }, + { RTLIB::MUL_I64, "__mspabi_mpyll" }, + // The __mspabi_mpysl* functions are NOT implemented in libgcc + // The __mspabi_mpyul* functions are NOT implemented in libgcc + }; + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + } + setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::MSP430_BUILTIN); } + // Several of the runtime library functions use a special calling conv + setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN); + setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN); + // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll + setMinFunctionAlignment(1); setPrefFunctionAlignment(2); } @@ -281,10 +452,27 @@ template<typename ArgT> static void AnalyzeArguments(CCState &State, SmallVectorImpl<CCValAssign> &ArgLocs, const SmallVectorImpl<ArgT> &Args) { - static const MCPhysReg RegList[] = { + static const MCPhysReg CRegList[] = { MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15 }; - static const unsigned NbRegs = array_lengthof(RegList); + static const unsigned CNbRegs = array_lengthof(CRegList); + static const MCPhysReg BuiltinRegList[] = { + MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11, + MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15 + }; + static const unsigned BuiltinNbRegs = array_lengthof(BuiltinRegList); + + ArrayRef<MCPhysReg> RegList; + unsigned NbRegs; + + bool Builtin = (State.getCallingConv() == CallingConv::MSP430_BUILTIN); + if (Builtin) { + RegList = BuiltinRegList; + NbRegs = BuiltinNbRegs; + } else { + RegList = CRegList; + NbRegs = CNbRegs; + } if (State.isVarArg()) { AnalyzeVarArgs(State, Args); @@ -294,6 +482,11 @@ static void AnalyzeArguments(CCState &State, SmallVector<unsigned, 4> ArgsParts; ParseFunctionArgs(Args, ArgsParts); + if (Builtin) { + assert(ArgsParts.size() == 2 && + "Builtin calling convention requires two arguments"); + } + unsigned RegsLeft = NbRegs; bool UsedStack = false; unsigned ValNo = 0; @@ -323,6 +516,11 @@ static void AnalyzeArguments(CCState &State, unsigned Parts = ArgsParts[i]; + if (Builtin) { + assert(Parts == 4 && + "Builtin calling convention requires 64-bit arguments"); + } + if (!UsedStack && Parts == 2 && RegsLeft == 1) { // Special case for 32-bit register split, see EABI section 3.3.3 unsigned Reg = State.AllocateReg(RegList); @@ -400,6 +598,7 @@ 
MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, switch (CallConv) { default: llvm_unreachable("Unsupported calling convention"); + case CallingConv::MSP430_BUILTIN: case CallingConv::Fast: case CallingConv::C: return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall, @@ -598,7 +797,6 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, /// LowerCCCCallTo - functions arguments are copied from virtual regs to /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. -// TODO: sret. SDValue MSP430TargetLowering::LowerCCCCallTo( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, @@ -615,8 +813,7 @@ SDValue MSP430TargetLowering::LowerCCCCallTo( unsigned NumBytes = CCInfo.getNextStackOffset(); auto PtrVT = getPointerTy(DAG.getDataLayout()); - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getConstant(NumBytes, dl, PtrVT, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass; SmallVector<SDValue, 12> MemOpChains; diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h index e3259bd6a7bc..d81f17e753c5 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.h +++ b/lib/Target/MSP430/MSP430InstrInfo.h @@ -85,6 +85,12 @@ public: MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded = nullptr) const override; + + int64_t getFramePoppedByCallee(const MachineInstr &I) const { + assert(isFrameInstr(I) && "Not a frame instruction"); + assert(I.getOperand(1).getImm() >= 0 && "Size must not be negative"); + return I.getOperand(1).getImm(); + } }; } diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index 22fc2474fae6..1cd18611e52c 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -23,7 +23,8 @@ class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>; // Type Profiles. //===----------------------------------------------------------------------===// def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; -def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>; +def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>, + SDTCisVT<1, i16>]>; def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; @@ -113,9 +114,9 @@ def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber SR. let Defs = [SP, SR], Uses = [SP] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt), +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2), "#ADJCALLSTACKDOWN", - [(MSP430callseq_start timm:$amt)]>; + [(MSP430callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2), "#ADJCALLSTACKUP", [(MSP430callseq_end timm:$amt1, timm:$amt2)]>; @@ -209,7 +210,7 @@ let isCall = 1 in // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. Uses for argument // registers are added manually. 
- let Defs = [R12, R13, R14, R15, SR], + let Defs = [R11, R12, R13, R14, R15, SR], Uses = [SP] in { def CALLi : II16i<0x0, (outs), (ins i16imm:$dst), diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 81cd9d1ad3f8..9600bc28f100 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -41,12 +41,12 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const Function* F = MF->getFunction(); static const MCPhysReg CalleeSavedRegs[] = { MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7, - MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11, + MSP430::R8, MSP430::R9, MSP430::R10, 0 }; static const MCPhysReg CalleeSavedRegsFP[] = { MSP430::R5, MSP430::R6, MSP430::R7, - MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11, + MSP430::R8, MSP430::R9, MSP430::R10, 0 }; static const MCPhysReg CalleeSavedRegsIntr[] = { diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 21c99da0922d..b83f44a74d5b 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -1133,7 +1133,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, if (NumBytes < 16) NumBytes = 16; - emitInst(Mips::ADJCALLSTACKDOWN).addImm(16); + emitInst(Mips::ADJCALLSTACKDOWN).addImm(16).addImm(0); // Process the args. MVT firstMVT; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 8f39ebd42a5c..78bae6954c3c 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -2787,7 +2787,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true); if (!IsTailCall) - Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL); + Chain = DAG.getCALLSEQ_START(Chain, NextStackOffset, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? 
Mips::SP_64 : Mips::SP, diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index b90077d7807d..8761946b8dbb 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -21,7 +21,7 @@ def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>, SDTCisInt<4>]>; -def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>; def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, @@ -1719,8 +1719,8 @@ let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in { } let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { -def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt), - [(callseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(callseq_end timm:$amt1, timm:$amt2)]>; } diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp index 68dcbdfb4211..f8d9c34556bc 100644 --- a/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -257,7 +257,7 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg, // Get the instruction that loads the function address from the GOT. Reg = MO->getReg(); - Val = (Value*)nullptr; + Val = nullptr; MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); MachineInstr *DefMI = MRI.getVRegDef(Reg); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 61fdda8aa109..ebaaf42bc64e 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1430,8 +1430,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return Chain;
SDValue tempChain = Chain;
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
SDValue InFlag = Chain.getValue(1);
unsigned paramCount = 0;
@@ -1549,7 +1548,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getMemIntrinsicNode(
Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
- TheStoreType, MachinePointerInfo(), EltAlign);
+ TheStoreType, MachinePointerInfo(), EltAlign,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
InFlag = Chain.getValue(1);
// Cleanup.
@@ -1609,7 +1610,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, theVal, InFlag };
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
CopyParamOps, elemtype,
- MachinePointerInfo());
+ MachinePointerInfo(), /* Align */ 0,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
InFlag = Chain.getValue(1);
}
@@ -1795,7 +1798,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
SDValue RetVal = DAG.getMemIntrinsicNode(
Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
- MachinePointerInfo(), EltAlign);
+ MachinePointerInfo(), EltAlign, /* Volatile */ false,
+ /* ReadMem */ true, /* WriteMem */ false, /* Size */ 0);
for (unsigned j = 0; j < NumElts; ++j) {
SDValue Ret = RetVal.getValue(j);
@@ -2579,7 +2583,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
StoreOperands, TheStoreType,
- MachinePointerInfo(), 1);
+ MachinePointerInfo(), /* Align */ 1,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
// Cleanup vector state.
StoreOperands.clear();
}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 9378b29a9d0e..b5b5ea1ed639 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3101,7 +3101,8 @@ def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target), (CBranchOther Int1Regs:$a, bb:$target)>;
// Call
-def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
@@ -3126,10 +3127,10 @@ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> : NVPTXInst<outs, ins, asmstr, pattern>;
def Callseq_Start :
- NVPTXInst<(outs), (ins i32imm:$amt),
- "\\{ // callseq $amt\n"
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\{ // callseq $amt1, $amt2\n"
"\t.reg .b32 temp_param_reg;",
- [(callseq_start timm:$amt)]>;
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def Callseq_End :
NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"\\} // callseq $amt1",
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 12ffbfdeacc1..11d22377611b 100644 --- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -204,6 +204,17 @@ static const unsigned G8Regs[] = { PPC::X28, PPC::X29, PPC::X30, PPC::X31 }; +static const unsigned G80Regs[] = { + PPC::ZERO8, PPC::X1, PPC::X2, PPC::X3, + PPC::X4, PPC::X5, PPC::X6, PPC::X7, + PPC::X8, PPC::X9, PPC::X10, PPC::X11, + PPC::X12, PPC::X13, PPC::X14, PPC::X15, + PPC::X16, PPC::X17, PPC::X18, PPC::X19, + PPC::X20, PPC::X21, PPC::X22, PPC::X23, + PPC::X24, PPC::X25, PPC::X26, PPC::X27, + PPC::X28, PPC::X29, PPC::X30, PPC::X31 +}; + static const unsigned QFRegs[] = { PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, @@ -301,6 +312,12 @@ static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo, return decodeRegisterClass(Inst, RegNo, G8Regs); } +static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, G80Regs); +} + #define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index 609d959c6d08..84bb9ec56800 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -95,7 +95,8 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } - if (MI->getOpcode() == PPC::RLDICR) { + if (MI->getOpcode() == PPC::RLDICR || + MI->getOpcode() == PPC::RLDICR_32) { unsigned char SH = MI->getOperand(2).getImm(); unsigned char ME = MI->getOperand(3).getImm(); // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index 9b91b9ab8f82..2fc8654deeab 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -1330,7 +1330,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args, // Issue CALLSEQ_START. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TII.getCallFrameSetupOpcode())) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Prepare to assign register arguments. 
Every argument uses up a // GPR protocol register even if it's passed in a floating-point @@ -2246,6 +2246,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, } case PPC::EXTSW: + case PPC::EXTSW_32: case PPC::EXTSW_32_64: { if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8) return false; diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 1b0402bf003d..5fa7b2c6bfb1 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -54,6 +54,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/Statistic.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -68,6 +69,14 @@ using namespace llvm; #define DEBUG_TYPE "ppc-codegen" +STATISTIC(NumSextSetcc, + "Number of (sext(setcc)) nodes expanded into GPR sequence."); +STATISTIC(NumZextSetcc, + "Number of (zext(setcc)) nodes expanded into GPR sequence."); +STATISTIC(SignExtensionsAdded, + "Number of sign extensions for compare inputs added."); +STATISTIC(ZeroExtensionsAdded, + "Number of zero extensions for compare inputs added."); // FIXME: Remove this once the bug has been fixed! cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug", cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); @@ -252,7 +261,28 @@ namespace { #include "PPCGenDAGISel.inc" private: + // Conversion type for interpreting results of a 32-bit instruction as + // a 64-bit value or vice versa. + enum ExtOrTruncConversion { Ext, Trunc }; + + // Modifiers to guide how an ISD::SETCC node's result is to be computed + // in a GPR. + // ZExtOrig - use the original condition code, zero-extend value + // ZExtInvert - invert the condition code, zero-extend value + // SExtOrig - use the original condition code, sign-extend value + // SExtInvert - invert the condition code, sign-extend value + enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert }; + bool trySETCC(SDNode *N); + bool tryEXTEND(SDNode *N); + SDValue signExtendInputIfNeeded(SDValue Input); + SDValue zeroExtendInputIfNeeded(SDValue Input); + SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv); + SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts); void PeepholePPC64(); void PeepholePPC64ZExt(); @@ -2471,6 +2501,225 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { return true; } +/// If this node is a sign/zero extension of an integer comparison, +/// it can usually be computed in GPR's rather than using comparison +/// instructions and ISEL. We only do this on 64-bit targets for now +/// as the code is specialized for 64-bit (it uses 64-bit instructions +/// and assumes 64-bit registers). +bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) { + if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) + return false; + assert((N->getOpcode() == ISD::ZERO_EXTEND || + N->getOpcode() == ISD::SIGN_EXTEND) && + "Expecting a zero/sign extend node!"); + + if (N->getOperand(0).getOpcode() != ISD::SETCC) + return false; + + SDValue WideRes = + getSETCCInGPR(N->getOperand(0), + N->getOpcode() == ISD::SIGN_EXTEND ? 
+ SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); + + if (!WideRes) + return false; + + SDLoc dl(N); + bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32; + bool Output32Bit = N->getValueType(0) == MVT::i32; + + NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0; + NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1; + + SDValue ConvOp = WideRes; + if (Inputs32Bit != Output32Bit) + ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext : + ExtOrTruncConversion::Trunc); + ReplaceNode(N, ConvOp.getNode()); + + return true; +} + +/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it. +/// Useful when emitting comparison code for 32-bit values without using +/// the compare instruction (which only considers the lower 32-bits). +SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) { + assert(Input.getValueType() == MVT::i32 && + "Can only sign-extend 32-bit values here."); + unsigned Opc = Input.getOpcode(); + + // The value was sign extended and then truncated to 32-bits. No need to + // sign extend it again. + if (Opc == ISD::TRUNCATE && + (Input.getOperand(0).getOpcode() == ISD::AssertSext || + Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND)) + return Input; + + LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input); + // The input is a sign-extending load. No reason to sign-extend. + if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD) + return Input; + + ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input); + // We don't sign-extend constants and already sign-extended values. + if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG || + Opc == ISD::SIGN_EXTEND) + return Input; + + SDLoc dl(Input); + SignExtensionsAdded++; + return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0); +} + +/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it. +/// Useful when emitting comparison code for 32-bit values without using +/// the compare instruction (which only considers the lower 32-bits). +SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) { + assert(Input.getValueType() == MVT::i32 && + "Can only zero-extend 32-bit values here."); + LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input); + unsigned Opc = Input.getOpcode(); + + // No need to zero-extend loaded values (unless they're loaded with + // a sign-extending load). + if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD) + return Input; + + ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input); + bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0; + // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG so we have + // to conservatively actually clear the high bits. We also don't need to + // zero-extend constants or values that are already zero-extended. + if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND) + return Input; + + SDLoc dl(Input); + ZeroExtensionsAdded++; + return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input, + getI64Imm(0, dl), getI64Imm(32, dl)), + 0); +} + +// Handle a 32-bit value in a 64-bit register and vice-versa. These are of +// course not actual zero/sign extensions that will generate machine code, +// they're just a way to reinterpret a 32 bit value in a register as a +// 64 bit value and vice-versa. 
+SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes, + ExtOrTruncConversion Conv) { + SDLoc dl(NatWidthRes); + + // For reinterpreting 32-bit values as 64 bit values, we generate + // INSERT_SUBREG IMPLICIT_DEF:i64, <input>, TargetConstant:i32<1> + if (Conv == ExtOrTruncConversion::Ext) { + SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0); + SDValue SubRegIdx = + CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); + return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64, + ImDef, NatWidthRes, SubRegIdx), 0); + } + + assert(Conv == ExtOrTruncConversion::Trunc && + "Unknown conversion between 32 and 64 bit values."); + // For reinterpreting 64-bit values as 32-bit values, we just need to + // EXTRACT_SUBREG (i.e. extract the low word). + SDValue SubRegIdx = + CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); + return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32, + NatWidthRes, SubRegIdx), 0); +} + +/// Produces a zero-extended result of comparing two 32-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5) + // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); + SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl), + getI32Imm(31, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, + ShiftOps), 0); + } + } +} + +/// Produces a sign-extended result of comparing two 32-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (sext (setcc %a, %b, seteq)) -> + // (ashr (shl (ctlz (xor %a, %b)), 58), 63) + // (sext (setcc %a, 0, seteq)) -> + // (ashr (shl (ctlz %a), 58), 63) + SDValue CountInput = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Cntlzw = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0); + SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) }; + SDValue Sldi = + SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0); + return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi, + getI32Imm(63, dl)), 0); + } + } +} + +/// Returns an equivalent of a SETCC node but with the result the same width as +/// the inputs. This can also be used for SELECT_CC if either the true or false +/// value is a power of two while the other is zero. +SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, + SetccInGPROpts ConvOpts) { + assert((Compare.getOpcode() == ISD::SETCC || + Compare.getOpcode() == ISD::SELECT_CC) && + "An ISD::SETCC node required here."); + + SDValue LHS = Compare.getOperand(0); + SDValue RHS = Compare.getOperand(1); + + // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC. + int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ?
4 : 2; + ISD::CondCode CC = + cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get(); + EVT InputVT = LHS.getValueType(); + if (InputVT != MVT::i32) + return SDValue(); + + SDLoc dl(Compare); + ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS); + int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX; + + if (ConvOpts == SetccInGPROpts::ZExtInvert || + ConvOpts == SetccInGPROpts::SExtInvert) + CC = ISD::getSetCCInverse(CC, true); + + if (ISD::isSignedIntSetCC(CC)) { + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + } else if (ISD::isUnsignedIntSetCC(CC)) { + LHS = zeroExtendInputIfNeeded(LHS); + RHS = zeroExtendInputIfNeeded(RHS); + } + + bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig || + ConvOpts == SetccInGPROpts::SExtInvert; + if (IsSext) + return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl); + return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); +} + void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); @@ -2508,6 +2757,12 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } break; + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + if (tryEXTEND(N)) + return; + break; + case ISD::SETCC: if (trySETCC(N)) return; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 685f24cb502e..17bdd595da10 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -923,6 +923,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget.hasFPCVT()) @@ -4949,8 +4952,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else @@ -5000,9 +5002,8 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( Flags, DAG, dl); // This must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1), - SDLoc(MemcpyCall)); + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0, + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); Chain = CallSeqStart = NewCallSeqStart; @@ -5083,9 +5084,9 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // The MEMCPY must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1), - SDLoc(MemcpyCall)); + int64_t FrameSize = CallSeqStart.getConstantOperandVal(1); + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0, + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); return NewCallSeqStart; @@ -5268,8 +5269,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(NumBytes, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else @@ -5828,8 +5828,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else @@ -8741,9 +8740,9 @@ static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { // The mappings for emitLeading/TrailingFence are taken from // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) @@ -8751,10 +8750,10 @@ Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, return nullptr; } -Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { - if (IsLoad && isAcquireOrStronger(Ord)) +Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) return callIntrinsic(Builder, Intrinsic::ppc_lwsync); // FIXME: this is too conservative, a dependent branch + isync is enough. // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and @@ -11316,6 +11315,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDLoc dl(N); switch (N->getOpcode()) { default: break; + case ISD::SHL: + return combineSHL(N, DCI); + case ISD::SRA: + return combineSRA(N, DCI); + case ISD::SRL: + return combineSRL(N, DCI); case PPCISD::SHL: if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0); @@ -12948,3 +12953,58 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return Imm.isPosZero(); } } + +// For vector shift operation op, fold +// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y) +static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned OpSizeInBits = VT.getScalarSizeInBits(); + unsigned Opcode = N->getOpcode(); + unsigned TargetOpcode; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected shift operation"); + case ISD::SHL: + TargetOpcode = PPCISD::SHL; + break; + case ISD::SRL: + TargetOpcode = PPCISD::SRL; + break; + case ISD::SRA: + TargetOpcode = PPCISD::SRA; + break; + } + + if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) && + N1->getOpcode() == ISD::AND) + if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) + if (Mask->getZExtValue() == OpSizeInBits - 1) + return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0)); + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 32661099b79d..4fc744257262 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -117,9 +117,13 @@ namespace llvm { /// at function entry, used for PIC code. GlobalBaseReg, - /// These nodes represent the 32-bit PPC shifts that operate on 6-bit - /// shift amounts. These nodes are generated by the multi-precision shift - /// code. + /// These nodes represent PPC shifts. + /// + /// For scalar types, only the last `n + 1` bits of the shift amounts + /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. + /// for exact behaviors. + /// + /// For vector types, only the last n bits are used. See vsld. 
SRL, SRA, SHL, /// The combination of sra[wd]i and addze used to implemented signed @@ -617,10 +621,10 @@ namespace llvm { return true; } - Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; - Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; + Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, @@ -999,6 +1003,9 @@ namespace llvm { SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const; SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const; /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces /// SETCC with integer subtraction when (1) there is a legal way of doing it @@ -1017,14 +1024,6 @@ namespace llvm { SDValue combineElementTruncationToVectorTruncation(SDNode *N, DAGCombinerInfo &DCI) const; - - bool supportsModuloShift(ISD::NodeType Inst, - EVT ReturnType) const override { - assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) && - "Expect a shift instruction"); - assert(isOperationLegal(Inst, ReturnType)); - return ReturnType.isVector(); - } }; namespace PPC { diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 997b96ca6ec8..a8433919f0f3 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -634,10 +634,19 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS), "extsw", "$rA, $rS", IIC_IntSimple, [(set i64:$rA, (sext i32:$rS))]>, isPPC64; +let isCodeGenOnly = 1 in +def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS), + "extsw $rA, $rS", IIC_IntSimple, + []>, isPPC64; defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), "sradi", "$rA, $rS, $SH", IIC_IntRotateDI, [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64; +// For fast-isel: +let isCodeGenOnly = 1 in +def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH), + "sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64; + defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS), "cntlzd", "$rA, $rS", IIC_IntGeneral, [(set i64:$rA, (ctlz i64:$rS))]>; @@ -721,15 +730,26 @@ defm RLDICL : MDForm_1r<30, 0, // For fast-isel: let isCodeGenOnly = 1 in def RLDICL_32_64 : MDForm_1<30, 0, - (outs g8rc:$rA), - (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), - "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, - []>, isPPC64; + (outs g8rc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; // End fast-isel. 
+let isCodeGenOnly = 1 in +def RLDICL_32 : MDForm_1<30, 0, + (outs gprc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; defm RLDICR : MDForm_1r<30, 1, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, []>, isPPC64; +let isCodeGenOnly = 1 in +def RLDICR_32 : MDForm_1<30, 1, + (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicr $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; defm RLDIC : MDForm_1r<30, 2, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index c380766e9f5c..e14d18fd5433 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -987,6 +987,12 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)), (v8i16 (VSLH $vA, $vB))>; def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)), (v4i32 (VSLW $vA, $vB))>; +def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)), + (v16i8 (VSLB $vA, $vB))>; +def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)), + (v8i16 (VSLH $vA, $vB))>; +def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)), + (v4i32 (VSLW $vA, $vB))>; def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)), (v16i8 (VSRB $vA, $vB))>; @@ -994,6 +1000,12 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)), (v8i16 (VSRH $vA, $vB))>; def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)), (v4i32 (VSRW $vA, $vB))>; +def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)), + (v16i8 (VSRB $vA, $vB))>; +def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)), + (v8i16 (VSRH $vA, $vB))>; +def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)), + (v4i32 (VSRW $vA, $vB))>; def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)), (v16i8 (VSRAB $vA, $vB))>; @@ -1001,6 +1013,12 @@ def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)), (v8i16 (VSRAH $vA, $vB))>; def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)), (v4i32 (VSRAW $vA, $vB))>; +def : Pat<(v16i8 (PPCsra v16i8:$vA, v16i8:$vB)), + (v16i8 (VSRAB $vA, $vB))>; +def : Pat<(v8i16 (PPCsra v8i16:$vA, v8i16:$vB)), + (v8i16 (VSRAH $vA, $vB))>; +def : Pat<(v4i32 (PPCsra v4i32:$vA, v4i32:$vB)), + (v4i32 (VSRAW $vA, $vB))>; // Float to integer and integer to float conversions def : Pat<(v4i32 (fp_to_sint v4f32:$vA)), @@ -1072,14 +1090,24 @@ def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB), // Vector shifts def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>; def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsld $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>; + "vsld $vD, $vA, $vB", IIC_VecGeneral, []>; def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsrd $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>; + "vsrd $vD, $vA, $vB", IIC_VecGeneral, []>; def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsrad $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>; + "vsrad $vD, $vA, $vB", IIC_VecGeneral, []>; + +def : Pat<(v2i64 (shl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSLD $vA, $vB))>; +def : Pat<(v2i64 (PPCshl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSLD $vA, $vB))>; +def : Pat<(v2i64 (srl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRD $vA, $vB))>; +def : Pat<(v2i64 (PPCsrl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRD $vA, $vB))>; +def : Pat<(v2i64 (sra v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRAD $vA, $vB))>; +def : Pat<(v2i64 (PPCsra v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRAD $vA, $vB))>; 
// Vector Integer Arithmetic Instructions let isCommutable = 1 in { diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index f004ce49cac0..1af5e7f28342 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -33,7 +33,8 @@ def SDT_PPCVexts : SDTypeProfile<1, 2, [ SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2> ]>; -def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; def SDT_PPCvperm : SDTypeProfile<1, 3, [ @@ -1099,9 +1100,11 @@ multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, let hasCtrlDep = 1 in { let Defs = [R1], Uses = [R1] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt", - [(callseq_start timm:$amt)]>; -def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), + "#ADJCALLSTACKDOWN $amt1 $amt2", + [(callseq_start timm:$amt1, timm:$amt2)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), + "#ADJCALLSTACKUP $amt1 $amt2", [(callseq_end timm:$amt1, timm:$amt2)]>; } @@ -4163,6 +4166,8 @@ def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0 def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; +def : InstAlias<"clrldi $rA, $rS, $n", + (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>; def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b", diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 967557452f24..b98140fedfc0 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -1436,7 +1436,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in { def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA), "mtvsrws $XT, $rA", IIC_VecGeneral, []>; - def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB), + def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB), "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral, []>, Requires<[In64BitMode]>; diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 0c1260a2965b..c7aa4cb78b7a 100644 --- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -99,7 +99,8 @@ protected: // Don't really need to save data to the stack - the clobbered // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr) // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR). - BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0); + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) + .addImm(0); // Expand into two ops built prior to the existing instruction. 
MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index acb34d5baaa8..9e7e3c6b705a 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -773,8 +773,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, } } - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl); SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; SmallVector<SDValue, 8> MemOpChains; @@ -1165,8 +1164,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, // Adjust the stack pointer to make room for the arguments. // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls // with more than 6 arguments. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL); // Collect the set of registers to pass to the function and their values. // This will be emitted as a sequence of CopyToReg nodes glued to the call @@ -2058,7 +2056,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue InFlag; - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, DL, true), DL); + Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL); Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag); InFlag = Chain.getValue(1); SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT); @@ -3386,7 +3384,10 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; - case 'r': return C_RegisterClass; + case 'r': + case 'f': + case 'e': + return C_RegisterClass; case 'I': // SIMM13 return C_Other; } @@ -3465,6 +3466,24 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &SP::IntPairRegClass); else return std::make_pair(0U, &SP::IntRegsRegClass); + case 'f': + if (VT == MVT::f32) + return std::make_pair(0U, &SP::FPRegsRegClass); + else if (VT == MVT::f64) + return std::make_pair(0U, &SP::LowDFPRegsRegClass); + else if (VT == MVT::f128) + return std::make_pair(0U, &SP::LowQFPRegsRegClass); + llvm_unreachable("Unknown ValueType for f-register-type!"); + break; + case 'e': + if (VT == MVT::f32) + return std::make_pair(0U, &SP::FPRegsRegClass); + else if (VT == MVT::f64) + return std::make_pair(0U, &SP::DFPRegsRegClass); + else if (VT == MVT::f128) + return std::make_pair(0U, &SP::QFPRegsRegClass); + llvm_unreachable("Unknown ValueType for e-register-type!"); + break; } } else if (!Constraint.empty() && Constraint.size() <= 5 && Constraint[0] == '{' && *(Constraint.end()-1) == '}') { diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index 5a19c624abb5..ae45c8be6752 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -195,7 +195,8 @@ def SPsjlj_longjmp: SDNode<"SPISD::EH_SJLJ_LONGJMP", [SDNPHasChain, SDNPSideEffect]>; // These are target-independent nodes, but have target-specific formats. 
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; @@ -404,9 +405,9 @@ let Defs = [O7] in { } let Defs = [O6], Uses = [O6] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - "!ADJCALLSTACKDOWN $amt", - [(callseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "!ADJCALLSTACKDOWN $amt1, $amt2", + [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), "!ADJCALLSTACKUP $amt1", [(callseq_end timm:$amt1, timm:$amt2)]>; diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td index 6ecfddfc7d66..6625eaafd992 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.td +++ b/lib/Target/Sparc/SparcRegisterInfo.td @@ -346,11 +346,13 @@ def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>; // Floating point register classes. def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>; - def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>; - def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>; +// The Low?FPRegs classes are used only for inline-asm constraints. +def LowDFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>; +def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>; + // Floating point control register classes. def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>; diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 3f91ca9035a6..efcf6696fd50 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -262,6 +262,9 @@ public: bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const { return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287); } + bool isMemDisp12Len4(RegisterKind RegKind) const { + return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x10); + } bool isMemDisp12Len8(RegisterKind RegKind) const { return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100); } @@ -347,6 +350,7 @@ public: bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); } bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); } bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); } + bool isBDLAddr64Disp12Len4() const { return isMemDisp12Len4(ADDR64Reg); } bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); } bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, ADDR64Reg); } bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); } diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index a281a0aa6bcc..27fd70bc6092 100644 --- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -327,6 +327,18 @@ static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field, return MCDisassembler::Success; } +static DecodeStatus decodeBDLAddr12Len4Operand(MCInst &Inst, uint64_t Field, + const unsigned *Regs) { + uint64_t Length = Field >> 16; + uint64_t Base = (Field >> 12) & 0xf; + uint64_t Disp = Field & 0xfff; + assert(Length < 16 && "Invalid BDLAddr12Len4"); + 
Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base])); + Inst.addOperand(MCOperand::createImm(Disp)); + Inst.addOperand(MCOperand::createImm(Length + 1)); + return MCDisassembler::Success; +} + static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field, const unsigned *Regs) { uint64_t Length = Field >> 16; @@ -399,6 +411,13 @@ static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field, return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs); } +static DecodeStatus decodeBDLAddr64Disp12Len4Operand(MCInst &Inst, + uint64_t Field, + uint64_t Address, + const void *Decoder) { + return decodeBDLAddr12Len4Operand(Inst, Field, SystemZMC::GR64Regs); +} + static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst, uint64_t Field, uint64_t Address, diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index 092eb4011adc..d188f56512ab 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -77,6 +77,9 @@ private: uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + uint64_t getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; @@ -220,6 +223,17 @@ getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum, } uint64_t SystemZMCCodeEmitter:: +getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI); + uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI); + uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1; + assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len)); + return (Len << 16) | (Base << 12) | Disp; +} + +uint64_t SystemZMCCodeEmitter:: getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt index 86a1322c9e23..74cf653b9d95 100644 --- a/lib/Target/SystemZ/README.txt +++ b/lib/Target/SystemZ/README.txt @@ -63,7 +63,7 @@ via a register.) -- -We don't use ICM or STCM. +We don't use ICM, STCM, or CLM. 
-- diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td index 716e5add8051..7bfa378aa85c 100644 --- a/lib/Target/SystemZ/SystemZFeatures.td +++ b/lib/Target/SystemZ/SystemZFeatures.td @@ -68,6 +68,11 @@ def FeaturePopulationCount : SystemZFeature< "Assume that the population-count facility is installed" >; +def FeatureMessageSecurityAssist4 : SystemZFeature< + "message-security-assist-extension4", "MessageSecurityAssist4", + "Assume that the message-security-assist extension facility 4 is installed" +>; + def Arch9NewFeatures : SystemZFeatureList<[ FeatureDistinctOps, FeatureFastSerialization, @@ -75,7 +80,8 @@ def Arch9NewFeatures : SystemZFeatureList<[ FeatureHighWord, FeatureInterlockedAccess1, FeatureLoadStoreOnCond, - FeaturePopulationCount + FeaturePopulationCount, + FeatureMessageSecurityAssist4 ]>; //===----------------------------------------------------------------------===// @@ -133,6 +139,11 @@ def FeatureLoadStoreOnCond2 : SystemZFeature< "Assume that the load/store-on-condition facility 2 is installed" >; +def FeatureMessageSecurityAssist5 : SystemZFeature< + "message-security-assist-extension5", "MessageSecurityAssist5", + "Assume that the message-security-assist extension facility 5 is installed" +>; + def FeatureVector : SystemZFeature< "vector", "Vector", "Assume that the vector facility is installed" >; def FeatureNoVector : SystemZMissingFeature<"Vector">; def Arch11NewFeatures : SystemZFeatureList<[ FeatureLoadAndZeroRightmostByte, FeatureLoadStoreOnCond2, + FeatureMessageSecurityAssist5, FeatureVector ]>; diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 6989aabb8c6a..235e095f0010 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1110,9 +1110,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Mark the start of the call. if (!IsTailCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getConstant(NumBytes, DL, PtrVT, true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); // Copy argument values to their designated locations. SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass; @@ -6354,3 +6352,12 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( llvm_unreachable("Unexpected instr type to insert"); } } + +// This is only used by the isel schedulers, and is needed only to prevent +// the compiler from crashing when list-ilp is used.
+const TargetRegisterClass * +SystemZTargetLowering::getRepRegClassFor(MVT VT) const { + if (VT == MVT::Untyped) + return &SystemZ::ADDR128BitRegClass; + return TargetLowering::getRepRegClassFor(VT); +} diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 1c34dc43e8bb..79c8c4d92669 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -590,6 +590,8 @@ private: MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const; + + const TargetRegisterClass *getRepRegClassFor(MVT VT) const override; }; } // end namespace llvm diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index bb6d27e24828..364b81f98eed 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -458,6 +458,12 @@ def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>; def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>; def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>; +// Divide to integer. +let Defs = [CC] in { + def DIEBR : TernaryRRFb<"diebr", 0xB353, FP32, FP32, FP32>; + def DIDBR : TernaryRRFb<"didbr", 0xB35B, FP64, FP64, FP64>; +} + //===----------------------------------------------------------------------===// // Comparisons //===----------------------------------------------------------------------===// @@ -469,6 +475,13 @@ let Defs = [CC], CCValues = 0xF in { def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>; def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>; + + def KEBR : CompareRRE<"kebr", 0xB308, null_frag, FP32, FP32>; + def KDBR : CompareRRE<"kdbr", 0xB318, null_frag, FP64, FP64>; + def KXBR : CompareRRE<"kxbr", 0xB348, null_frag, FP128, FP128>; + + def KEB : CompareRXE<"keb", 0xED08, null_frag, FP32, load, 4>; + def KDB : CompareRXE<"kdb", 0xED18, null_frag, FP64, load, 8>; } // Test Data Class. 
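Before the instruction-format changes that follow, a brief aside on the 4-bit-length BDL addressing operand this patch introduces for the new SS-b and RSL-a instructions. The MC-layer encoder stores the operand length minus one next to the 4-bit base register and the 12-bit displacement, so the 4-bit field covers lengths 1 through 16. Below is a minimal stand-alone sketch of that packing and its inverse; the helper names are hypothetical and not part of the patch, only the bit arithmetic mirrors getBDLAddr12Len4Encoding and decodeBDLAddr12Len4Operand above.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Pack a base register, 12-bit displacement and operand length into the
// 20-bit BDL field, storing the length biased by minus one.
static uint64_t packBDL12Len4(uint64_t Base, uint64_t Disp, uint64_t Length) {
  uint64_t Len = Length - 1;
  assert(Base < 16 && Disp < 4096 && Len < 16 && "field out of range");
  return (Len << 16) | (Base << 12) | Disp;
}

// Recover the three components, undoing the minus-one bias on the length.
static void unpackBDL12Len4(uint64_t Field, uint64_t &Base, uint64_t &Disp,
                            uint64_t &Length) {
  Length = (Field >> 16) + 1;
  Base = (Field >> 12) & 0xf;
  Disp = Field & 0xfff;
}

int main() {
  // Base register 15, displacement 0x100, length 4 -> field 0x3f100.
  uint64_t Field = packBDL12Len4(15, 0x100, 4);
  uint64_t Base, Disp, Length;
  unpackBDL12Len4(Field, Base, Disp, Length);
  std::printf("field=%#llx base=%llu disp=%#llx len=%llu\n",
              (unsigned long long)Field, (unsigned long long)Base,
              (unsigned long long)Disp, (unsigned long long)Length);
  return 0;
}

Compiled on its own, this should print field=0x3f100 base=15 disp=0x100 len=4; the same 20-bit value is what the new bdladdr12onlylen4 operand feeds into the SS-b and RSL-a encodings defined in SystemZInstrFormats.td below.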
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index c727f486087e..a37da2807854 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -710,6 +710,21 @@ class InstRSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{15-0} = RI2; } +class InstRSLa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<20> BDL1; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = BDL1{19-16}; + let Inst{35-32} = 0; + let Inst{31-16} = BDL1{15-0}; + let Inst{15-8} = 0; + let Inst{7-0} = op{7-0}; +} + class InstRSYa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -817,6 +832,37 @@ class InstSSa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{15-0} = BD2; } +class InstSSb<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<20> BDL1; + bits<20> BDL2; + + let Inst{47-40} = op; + let Inst{39-36} = BDL1{19-16}; + let Inst{35-32} = BDL2{19-16}; + let Inst{31-16} = BDL1{15-0}; + let Inst{15-0} = BDL2{15-0}; +} + +class InstSSc<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<20> BDL1; + bits<16> BD2; + bits<4> I3; + + let Inst{47-40} = op; + let Inst{39-36} = BDL1{19-16}; + let Inst{35-32} = I3; + let Inst{31-16} = BDL1{15-0}; + let Inst{15-0} = BD2; +} + class InstSSd<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -850,6 +896,20 @@ class InstSSe<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{15-0} = BD4; } +class InstSSf<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<16> BD1; + bits<24> BDL2; + + let Inst{47-40} = op; + let Inst{39-32} = BDL2{23-16}; + let Inst{31-16} = BD1; + let Inst{15-0} = BDL2{15-0}; +} + class InstSSE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1567,6 +1627,9 @@ class ICV<string name> // Inherent: // One register output operand and no input operands. // +// InherentDual: +// Two register output operands and no input operands. +// // StoreInherent: // One address operand. The instruction stores to the address. // @@ -1642,8 +1705,9 @@ class ICV<string name> // Two input operands and an implicit CC output operand. // // Test: -// Two input operands and an implicit CC output operand. The second -// input operand is an "address" operand used as a test class mask. +// One or two input operands and an implicit CC output operand. If +// present, the second input operand is an "address" operand used as +// a test class mask. // // Ternary: // One register output operand and three input operands. 
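As a concrete check on the SS-b layout defined above (the operand values here are illustrative; the PACK opcode 0xF2 is added later in this patch): for pack 128(3,%r4), 64(2,%r5), the first operand gives BDL1 = ((3-1) << 16) | (4 << 12) | 0x080 = 0x24080 and the second gives BDL2 = ((2-1) << 16) | (5 << 12) | 0x040 = 0x15040. Placing BDL1{19-16} and BDL2{19-16} into bits 39-36 and 35-32 and the two base-plus-displacement halves into bits 31-16 and 15-0 yields the 6-byte encoding f2 21 40 80 50 40.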
@@ -1691,6 +1755,10 @@ class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls, let R2 = 0; } +class InherentDualRRE<string mnemonic, bits<16> opcode, RegisterOperand cls> + : InstRRE<opcode, (outs cls:$R1, cls:$R2), (ins), + mnemonic#"\t$R1, $R2", []>; + class InherentVRIa<string mnemonic, bits<16> opcode, bits<16> value> : InstVRIa<opcode, (outs VR128:$V1), (ins), mnemonic#"\t$V1", []> { let I2 = value; @@ -1714,6 +1782,12 @@ class SideEffectInherentS<string mnemonic, bits<16> opcode, let BD2 = 0; } +class SideEffectInherentRRE<string mnemonic, bits<16> opcode> + : InstRRE<opcode, (outs), (ins), mnemonic, []> { + let R1 = 0; + let R2 = 0; +} + // Allow an optional TLS marker symbol to generate TLS call relocations. class CallRI<string mnemonic, bits<12> opcode> : InstRIb<opcode, (outs), (ins GR64:$R1, brtarget16tls:$RI2), @@ -2084,6 +2158,13 @@ multiclass LoadMultipleRSPair<string mnemonic, bits<8> rsOpcode, } } +class LoadMultipleSSe<string mnemonic, bits<8> opcode, RegisterOperand cls> + : InstSSe<opcode, (outs cls:$R1, cls:$R3), + (ins bdaddr12only:$BD2, bdaddr12only:$BD4), + mnemonic#"\t$R1, $R3, $BD2, $BD4", []> { + let mayLoad = 1; +} + class LoadMultipleVRSa<string mnemonic, bits<16> opcode> : InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2), mnemonic#"\t$V1, $V3, $BD2", []> { @@ -2355,6 +2436,15 @@ class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator, let OpType = "reg"; } +class UnaryMemRRFc<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src), + mnemonic#"\t$R1, $R2", []> { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; + let M3 = 0; +} + class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator, RegisterOperand cls, Immediate imm> : InstRIa<opcode, (outs cls:$R1), (ins imm:$I2), @@ -2585,11 +2675,61 @@ class SideEffectBinaryIE<string mnemonic, bits<16> opcode, : InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2), mnemonic#"\t$I1, $I2", []>; +class SideEffectBinarySI<string mnemonic, bits<8> opcode, Operand imm> + : InstSI<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2), + mnemonic#"\t$BD1, $I2", []>; + class SideEffectBinarySIL<string mnemonic, bits<16> opcode, SDPatternOperator operator, Immediate imm> : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2), mnemonic#"\t$BD1, $I2", [(operator bdaddr12only:$BD1, imm:$I2)]>; +class SideEffectBinarySSa<string mnemonic, bits<8> opcode> + : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1, bdaddr12only:$BD2), + mnemonic##"\t$BDL1, $BD2", []>; + +class SideEffectBinarySSb<string mnemonic, bits<8> opcode> + : InstSSb<opcode, + (outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2), + mnemonic##"\t$BDL1, $BDL2", []>; + +class SideEffectBinarySSf<string mnemonic, bits<8> opcode> + : InstSSf<opcode, (outs), (ins bdaddr12only:$BD1, bdladdr12onlylen8:$BDL2), + mnemonic##"\t$BD1, $BDL2", []>; + +class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src), + mnemonic#"\t$R1, $R2", []> { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; +} + +class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRE<opcode, (outs cls2:$R2), (ins cls1:$R1, cls2:$R2src), + mnemonic#"\t$R1, $R2", []> { + let Constraints = "$R2 = 
$R2src"; + let DisableEncoding = "$R2src"; +} + +class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRE<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src), + mnemonic#"\t$R1, $R2", []> { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; +} + +class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src), + mnemonic#"\t$R1, $R2", []> { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; + let M3 = 0; +} + class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator, RegisterOperand cls1, RegisterOperand cls2> : InstRR<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2), @@ -2654,6 +2794,20 @@ class BinaryRRFb<string mnemonic, bits<16> opcode, SDPatternOperator operator, let M4 = 0; } +class BinaryMemRRFc<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, Immediate imm> + : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3), + mnemonic#"\t$R1, $R2, $M3", []> { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; +} + +multiclass BinaryMemRRFcOpt<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> { + def "" : BinaryMemRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>; + def Opt : UnaryMemRRFc<mnemonic, opcode, cls1, cls2>; +} + class BinaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRRFe<opcode, (outs cls1:$R1), (ins imm32zx4:$M3, cls2:$R2), @@ -3112,6 +3266,34 @@ class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator, let AccessBytes = bytes; } +class StoreBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr12only> + : InstRSb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2), + mnemonic#"\t$R1, $M3, $BD2", []> { + let mayStore = 1; + let AccessBytes = bytes; +} + +class StoreBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr20only> + : InstRSYb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2), + mnemonic#"\t$R1, $M3, $BD2", []> { + let mayStore = 1; + let AccessBytes = bytes; +} + +multiclass StoreBinaryRSPair<string mnemonic, bits<8> rsOpcode, + bits<16> rsyOpcode, RegisterOperand cls, + bits<5> bytes> { + let DispKey = mnemonic ## #cls in { + let DispSize = "12" in + def "" : StoreBinaryRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>; + let DispSize = "20" in + def Y : StoreBinaryRSY<mnemonic#"y", rsyOpcode, cls, bytes, + bdaddr20pair>; + } +} + class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes, Immediate index> : InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3), @@ -3237,6 +3419,40 @@ multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode, } } +class CompareRS<string mnemonic, bits<8> opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr12only> + : InstRSb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2), + mnemonic#"\t$R1, $M3, $BD2", []> { + let mayLoad = 1; + let AccessBytes = bytes; +} + +class CompareRSY<string mnemonic, bits<16> opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr20only> + : InstRSYb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2), + mnemonic#"\t$R1, $M3, 
$BD2", []> { + let mayLoad = 1; + let AccessBytes = bytes; +} + +multiclass CompareRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode, + RegisterOperand cls, bits<5> bytes> { + let DispKey = mnemonic ## #cls in { + let DispSize = "12" in + def "" : CompareRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>; + let DispSize = "20" in + def Y : CompareRSY<mnemonic#"y", rsyOpcode, cls, bytes, bdaddr20pair>; + } +} + +class CompareSSb<string mnemonic, bits<8> opcode> + : InstSSb<opcode, + (outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2), + mnemonic##"\t$BDL1, $BDL2", []> { + let isCompare = 1; + let mayLoad = 1; +} + class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator, SDPatternOperator load, Immediate imm, AddressingMode mode = bdaddr12only> @@ -3313,18 +3529,68 @@ class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator, let M3 = 0; } +class TestRSL<string mnemonic, bits<16> opcode> + : InstRSLa<opcode, (outs), (ins bdladdr12onlylen4:$BDL1), + mnemonic#"\t$BDL1", []> { + let mayLoad = 1; +} + +class SideEffectTernarySSc<string mnemonic, bits<8> opcode> + : InstSSc<opcode, (outs), (ins bdladdr12onlylen4:$BDL1, + shift12only:$BD2, imm32zx4:$I3), + mnemonic##"\t$BDL1, $BD2, $I3", []>; + +class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode, + RegisterOperand cls1, + RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFb<opcode, (outs cls1:$R1, cls2:$R2, cls3:$R3), + (ins cls1:$R1src, cls2:$R2src, cls3:$R3src), + mnemonic#"\t$R1, $R3, $R2", []> { + let Constraints = "$R1 = $R1src, $R2 = $R2src, $R3 = $R3src"; + let DisableEncoding = "$R1src, $R2src, $R3src"; + let M4 = 0; +} + class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2, Immediate imm> : InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3), mnemonic#"\t$R1, $R2, $M3", []>; +class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + Immediate imm> + : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2), + (ins cls1:$R1src, cls2:$R2src, imm:$M3), + mnemonic#"\t$R1, $R2, $M3", []> { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; +} + +multiclass SideEffectTernaryMemMemRRFcOpt<string mnemonic, bits<16> opcode, + RegisterOperand cls1, + RegisterOperand cls2> { + def "" : SideEffectTernaryMemMemRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>; + def Opt : SideEffectBinaryMemMemRRFc<mnemonic, opcode, cls1, cls2>; +} + class SideEffectTernarySSF<string mnemonic, bits<12> opcode, RegisterOperand cls> : InstSSF<opcode, (outs), (ins bdaddr12only:$BD1, bdaddr12only:$BD2, cls:$R3), mnemonic#"\t$BD1, $BD2, $R3", []>; +class TernaryRRFb<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFb<opcode, (outs cls1:$R1, cls3:$R3), + (ins cls1:$R1src, cls2:$R2, imm32zx4:$M4), + mnemonic#"\t$R1, $R3, $R2, $M4", []> { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; +} + class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRRFe<opcode, (outs cls1:$R1), @@ -3376,6 +3642,24 @@ multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode, } } +class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRSa<opcode, (outs cls1:$R1, cls2:$R3), + (ins cls1:$R1src, cls2:$R3src, shift12only:$BD2), + mnemonic#"\t$R1, $R3, $BD2", []> { + 
let Constraints = "$R1 = $R1src, $R3 = $R3src"; + let DisableEncoding = "$R1src, $R3src"; +} + +class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRSYa<opcode, (outs cls1:$R1, cls2:$R3), + (ins cls1:$R1src, cls2:$R3src, shift20only:$BD2), + mnemonic#"\t$R1, $R3, $BD2", []> { + let Constraints = "$R1 = $R1src, $R3 = $R3src"; + let DisableEncoding = "$R1src, $R3src"; +} + class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator, RegisterOperand cls, SDPatternOperator load, bits<5> bytes> : InstRXF<opcode, (outs cls:$R1), @@ -3981,9 +4265,7 @@ class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm> // another instruction to handle the excess. multiclass MemorySS<string mnemonic, bits<8> opcode, SDPatternOperator sequence, SDPatternOperator loop> { - def "" : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1, - bdaddr12only:$BD2), - mnemonic##"\t$BDL1, $BD2", []>; + def "" : SideEffectBinarySSa<mnemonic, opcode>; let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, imm64:$length), @@ -4003,13 +4285,8 @@ multiclass MemorySS<string mnemonic, bits<8> opcode, // the full loop (the main instruction plus the branch on CC==3). multiclass StringRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator> { - def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2), - (ins GR64:$R1src, GR64:$R2src), - mnemonic#"\t$R1, $R2", []> { - let Uses = [R0L]; - let Constraints = "$R1 = $R1src, $R2 = $R2src"; - let DisableEncoding = "$R1src, $R2src"; - } + let Uses = [R0L] in + def "" : SideEffectBinaryMemMemRRE<mnemonic, opcode, GR64, GR64>; let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in def Loop : Pseudo<(outs GR64:$end), (ins GR64:$start1, GR64:$start2, GR32:$char), diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index d63525f29412..fa5ecdd85243 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// let hasNoSchedulingInfo = 1 in { - def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt), - [(callseq_start timm:$amt)]>; + def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), + [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), [(callseq_end timm:$amt1, timm:$amt2)]>; } @@ -464,6 +464,11 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>; // Memory-to-memory moves. let mayLoad = 1, mayStore = 1 in defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>; +let mayLoad = 1, mayStore = 1, Defs = [CC] in { + def MVCL : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>; + def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>; + def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>; +} // String moves. let mayLoad = 1, mayStore = 1, Defs = [CC] in @@ -707,6 +712,10 @@ def : StoreGR64PC<STHRL, aligned_truncstorei16>; defm : StoreGR64Pair<ST, STY, truncstorei32>; def : StoreGR64PC<STRL, aligned_truncstorei32>; +// Store characters under mask -- not (yet) used for codegen. 
+defm STCM : StoreBinaryRSPair<"stcm", 0xBE, 0xEB2D, GR32, 0>; +def STCMH : StoreBinaryRSY<"stcmh", 0xEB2C, GRH32, 0>; + //===----------------------------------------------------------------------===// // Multi-register moves //===----------------------------------------------------------------------===// @@ -715,6 +724,7 @@ def : StoreGR64PC<STRL, aligned_truncstorei32>; defm LM : LoadMultipleRSPair<"lm", 0x98, 0xEB98, GR32>; def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>; def LMH : LoadMultipleRSY<"lmh", 0xEB96, GRH32>; +def LMD : LoadMultipleSSe<"lmd", 0xEF, GR64>; // Multi-register stores. defm STM : StoreMultipleRSPair<"stm", 0x90, 0xEB90, GR32>; @@ -742,6 +752,10 @@ def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>; def STRV : StoreRXY<"strv", 0xE33E, z_strv, GR32, 4>; def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>; +// Byte-swapping memory-to-memory moves. +let mayLoad = 1, mayStore = 1 in + def MVCIN : SideEffectBinarySSa<"mvcin", 0xE8>; + //===----------------------------------------------------------------------===// // Load address instructions //===----------------------------------------------------------------------===// @@ -816,6 +830,7 @@ defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>; defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>; defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>; +// Insert characters under mask -- not (yet) used for codegen. let Defs = [CC] in { defm ICM : TernaryRSPair<"icm", 0xBF, 0xEB81, GR32, 0>; def ICMH : TernaryRSY<"icmh", 0xEB80, GRH32, 0>; @@ -919,6 +934,10 @@ let Defs = [CC] in { defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>; def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>; def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>; + + // Addition to memory. + def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>; + def ALGSI : BinarySIY<"algsi", 0xEB7E, null_frag, imm64sx8>; } defm : ZXB<addc, GR64, ALGFR>; @@ -1166,9 +1185,14 @@ def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>; def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>; // Multiplication of a register, producing two results. +def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>; def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>; // Multiplication of memory, producing two results. +def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>; +def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>; +def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>; def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>; //===----------------------------------------------------------------------===// @@ -1177,12 +1201,14 @@ def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>; let hasSideEffects = 1 in { // Do not speculatively execute. // Division and remainder, from registers. + def DR : BinaryRR <"dr", 0x1D, null_frag, GR128, GR32>; def DSGFR : BinaryRRE<"dsgfr", 0xB91D, z_sdivrem32, GR128, GR32>; def DSGR : BinaryRRE<"dsgr", 0xB90D, z_sdivrem64, GR128, GR64>; def DLR : BinaryRRE<"dlr", 0xB997, z_udivrem32, GR128, GR32>; def DLGR : BinaryRRE<"dlgr", 0xB987, z_udivrem64, GR128, GR64>; // Division and remainder, from memory. 
+ def D : BinaryRX <"d", 0x5D, null_frag, GR128, load, 4>; def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>; def DSG : BinaryRXY<"dsg", 0xE30D, z_sdivrem64, GR128, load, 8>; def DL : BinaryRXY<"dl", 0xE397, z_udivrem32, GR128, load, 4>; @@ -1193,23 +1219,32 @@ let hasSideEffects = 1 in { // Do not speculatively execute. // Shifts //===----------------------------------------------------------------------===// -// Shift left. +// Logical shift left. let hasSideEffects = 0 in { defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; - defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>; def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>; + def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>; +} + +// Arithmetic shift left. +let Defs = [CC] in { + defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>; + def SLAG : BinaryRSY<"slag", 0xEB0B, null_frag, GR64>; + def SLDA : BinaryRS<"slda", 0x8F, null_frag, GR128>; } // Logical shift right. let hasSideEffects = 0 in { defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>; + def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>; } // Arithmetic shift right. let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>; def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>; + def SRDA : BinaryRS<"srda", 0x8E, null_frag, GR128>; } // Rotate left. @@ -1351,8 +1386,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in { defm : ZXB<z_ucmp, GR64, CLGFR>; // Memory-to-memory comparison. -let mayLoad = 1, Defs = [CC] in +let mayLoad = 1, Defs = [CC] in { defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>; + def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>; + def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>; + def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>; +} // String comparison. let mayLoad = 1, Defs = [CC] in @@ -1381,6 +1420,12 @@ let Defs = [CC] in { def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>; def TMH : InstAlias<"tmh\t$R, $I", (TMLH GR32:$R, imm32lh16:$I), 0>; +// Compare logical characters under mask -- not (yet) used for codegen. 
+let Defs = [CC] in { + defm CLM : CompareRSPair<"clm", 0xBD, 0xEB21, GR32, 0>; + def CLMH : CompareRSY<"clmh", 0xEB20, GRH32, 0>; +} + //===----------------------------------------------------------------------===// // Prefetch and execution hint //===----------------------------------------------------------------------===// @@ -1581,6 +1626,115 @@ let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { } //===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +let mayLoad = 1, mayStore = 1 in + def TR : SideEffectBinarySSa<"tr", 0xDC>; + +let mayLoad = 1, Defs = [CC, R0L, R1D] in { + def TRT : SideEffectBinarySSa<"trt", 0xDD>; + def TRTR : SideEffectBinarySSa<"trtr", 0xD0>; +} + +let mayLoad = 1, mayStore = 1, Uses = [R0L] in + def TRE : SideEffectBinaryMemMemRRE<"tre", 0xB2A5, GR128, GR64>; + +let mayLoad = 1, Uses = [R1D], Defs = [CC] in { + defm TRTE : BinaryMemRRFcOpt<"trte", 0xB9BF, GR128, GR64>; + defm TRTRE : BinaryMemRRFcOpt<"trtre", 0xB9BD, GR128, GR64>; +} + +let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { + defm TROO : SideEffectTernaryMemMemRRFcOpt<"troo", 0xB993, GR128, GR64>; + defm TROT : SideEffectTernaryMemMemRRFcOpt<"trot", 0xB992, GR128, GR64>; + defm TRTO : SideEffectTernaryMemMemRRFcOpt<"trto", 0xB991, GR128, GR64>; + defm TRTT : SideEffectTernaryMemMemRRFcOpt<"trtt", 0xB990, GR128, GR64>; +} + +let mayLoad = 1, mayStore = 1, Defs = [CC] in { + defm CU12 : SideEffectTernaryMemMemRRFcOpt<"cu12", 0xB2A7, GR128, GR128>; + defm CU14 : SideEffectTernaryMemMemRRFcOpt<"cu14", 0xB9B0, GR128, GR128>; + defm CU21 : SideEffectTernaryMemMemRRFcOpt<"cu21", 0xB2A6, GR128, GR128>; + defm CU24 : SideEffectTernaryMemMemRRFcOpt<"cu24", 0xB9B1, GR128, GR128>; + def CU41 : SideEffectBinaryMemMemRRE<"cu41", 0xB9B2, GR128, GR128>; + def CU42 : SideEffectBinaryMemMemRRE<"cu42", 0xB9B3, GR128, GR128>; + + let isAsmParserOnly = 1 in { + defm CUUTF : SideEffectTernaryMemMemRRFcOpt<"cuutf", 0xB2A6, GR128, GR128>; + defm CUTFU : SideEffectTernaryMemMemRRFcOpt<"cutfu", 0xB2A7, GR128, GR128>; + } +} + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { + def KM : SideEffectBinaryMemMemRRE<"km", 0xB92E, GR128, GR128>; + def KMC : SideEffectBinaryMemMemRRE<"kmc", 0xB92F, GR128, GR128>; + + def KIMD : SideEffectBinaryMemRRE<"kimd", 0xB93E, GR64, GR128>; + def KLMD : SideEffectBinaryMemRRE<"klmd", 0xB93F, GR64, GR128>; + def KMAC : SideEffectBinaryMemRRE<"kmac", 0xB91E, GR64, GR128>; + + let Predicates = [FeatureMessageSecurityAssist4] in { + def KMF : SideEffectBinaryMemMemRRE<"kmf", 0xB92A, GR128, GR128>; + def KMO : SideEffectBinaryMemMemRRE<"kmo", 0xB92B, GR128, GR128>; + def KMCTR : SideEffectTernaryMemMemMemRRFb<"kmctr", 0xB92D, + GR128, GR128, GR128>; + def PCC : SideEffectInherentRRE<"pcc", 0xB92C>; + } + let Predicates = [FeatureMessageSecurityAssist5] in + def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; +} + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +defm CVB : BinaryRXPair<"cvb",0x4F, 0xE306, null_frag, GR32, load, 4>; +def CVBG : BinaryRXY<"cvbg", 0xE30E, 
null_frag, GR64, load, 8>; + +defm CVD : StoreRXPair<"cvd", 0x4E, 0xE326, null_frag, GR32, 4>; +def CVDG : StoreRXY<"cvdg", 0xE32E, null_frag, GR64, 8>; + +let mayLoad = 1, mayStore = 1 in { + def MVN : SideEffectBinarySSa<"mvn", 0xD1>; + def MVZ : SideEffectBinarySSa<"mvz", 0xD3>; + def MVO : SideEffectBinarySSb<"mvo", 0xF1>; + + def PACK : SideEffectBinarySSb<"pack", 0xF2>; + def PKA : SideEffectBinarySSf<"pka", 0xE9>; + def PKU : SideEffectBinarySSf<"pku", 0xE1>; + def UNPK : SideEffectBinarySSb<"unpk", 0xF3>; + let Defs = [CC] in { + def UNPKA : SideEffectBinarySSa<"unpka", 0xEA>; + def UNPKU : SideEffectBinarySSa<"unpku", 0xE2>; + } +} + +let mayLoad = 1, mayStore = 1 in { + let Defs = [CC] in { + def AP : SideEffectBinarySSb<"ap", 0xFA>; + def SP : SideEffectBinarySSb<"sp", 0xFB>; + def ZAP : SideEffectBinarySSb<"zap", 0xF8>; + def SRP : SideEffectTernarySSc<"srp", 0xF0>; + } + def MP : SideEffectBinarySSb<"mp", 0xFC>; + def DP : SideEffectBinarySSb<"dp", 0xFD>; + let Defs = [CC] in { + def ED : SideEffectBinarySSa<"ed", 0xDE>; + def EDMK : SideEffectBinarySSa<"edmk", 0xDF>; + } +} + +let Defs = [CC] in { + def CP : CompareSSb<"cp", 0xF9>; + def TP : TestRSL<"tp", 0xEBC0>; +} + +//===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -1712,12 +1866,39 @@ let usesCustomInserter = 1 in { // Search a block of memory for a character. let mayLoad = 1, Defs = [CC] in - defm SRST : StringRRE<"srst", 0xb25e, z_search_string>; + defm SRST : StringRRE<"srst", 0xB25E, z_search_string>; +let mayLoad = 1, Defs = [CC], Uses = [R0L] in + def SRSTU : SideEffectBinaryMemMemRRE<"srstu", 0xB9BE, GR64, GR64>; + +// Compare until substring equal. +let mayLoad = 1, Defs = [CC], Uses = [R0L, R1L] in + def CUSE : SideEffectBinaryMemMemRRE<"cuse", 0xB257, GR128, GR128>; + +// Compare and form codeword. +let mayLoad = 1, Defs = [CC, R1D, R2D, R3D], Uses = [R1D, R2D, R3D] in + def CFC : SideEffectAddressS<"cfc", 0xB21A, null_frag>; + +// Update tree. +let mayLoad = 1, mayStore = 1, Defs = [CC, R0D, R1D, R2D, R3D, R5D], + Uses = [R0D, R1D, R2D, R3D, R4D, R5D] in + def UPT : SideEffectInherentE<"upt", 0x0102>; + +// Checksum. +let mayLoad = 1, Defs = [CC] in + def CKSM : SideEffectBinaryMemMemRRE<"cksm", 0xB241, GR64, GR128>; + +// Compression call. +let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in + def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>; // Supervisor call. let hasSideEffects = 1, isCall = 1, Defs = [CC] in def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>; +// Monitor call. +let hasSideEffects = 1, isCall = 1 in + def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>; + // Store clock. let hasSideEffects = 1, Defs = [CC] in { def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>; @@ -1729,10 +1910,18 @@ let hasSideEffects = 1, Defs = [CC] in { let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>; +// Extract CPU attribute. +let hasSideEffects = 1 in + def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>; + // Extract CPU time. let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>; +// Extract PSW. +let hasSideEffects = 1, Uses = [CC] in + def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>; + // Execute. 
let hasSideEffects = 1 in { def EX : SideEffectBinaryRX<"ex", 0x44, GR64>; diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td index 7bb4fe5afb3f..713612129d90 100644 --- a/lib/Target/SystemZ/SystemZOperands.td +++ b/lib/Target/SystemZ/SystemZOperands.td @@ -531,6 +531,7 @@ def BDAddr64Disp12 : AddressAsmOperand<"BDAddr", "64", "12">; def BDAddr64Disp20 : AddressAsmOperand<"BDAddr", "64", "20">; def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">; def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">; +def BDLAddr64Disp12Len4 : AddressAsmOperand<"BDLAddr", "64", "12", "Len4">; def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">; def BDRAddr64Disp12 : AddressAsmOperand<"BDRAddr", "64", "12">; def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">; @@ -578,6 +579,7 @@ def bdxaddr20pair : BDXMode<"BDXAddr", "64", "20", "Pair">; def dynalloc12only : BDXMode<"DynAlloc", "64", "12", "Only">; def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">; def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">; +def bdladdr12onlylen4 : BDLMode<"BDLAddr", "64", "12", "Only", "4">; def bdladdr12onlylen8 : BDLMode<"BDLAddr", "64", "12", "Only", "8">; def bdraddr12only : BDRMode<"BDRAddr", "64", "12", "Only">; def bdvaddr12only : BDVMode< "64", "12">; diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index fde26ed4e1c5..adfc69c5d4cf 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -10,7 +10,8 @@ //===----------------------------------------------------------------------===// // Type profiles //===----------------------------------------------------------------------===// -def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>]>; +def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>, + SDTCisVT<1, i64>]>; def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>, SDTCisVT<1, i64>]>; def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td index dbba8ab42b5a..1ce0168f95e9 100644 --- a/lib/Target/SystemZ/SystemZSchedule.td +++ b/lib/Target/SystemZ/SystemZSchedule.td @@ -56,12 +56,16 @@ def LSU_lat1 : SchedWrite; // Floating point unit (zEC12 and earlier) def FPU : SchedWrite; def FPU2 : SchedWrite; +def DFU : SchedWrite; +def DFU2 : SchedWrite; // Vector sub units (z13) def VecBF : SchedWrite; def VecBF2 : SchedWrite; def VecDF : SchedWrite; def VecDF2 : SchedWrite; +def VecDFX : SchedWrite; +def VecDFX2 : SchedWrite; def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit. 
def VecMul : SchedWrite; def VecStr : SchedWrite; diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td index 7aee6f52e9a7..612c3b6cf96e 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -76,6 +76,8 @@ def : WriteRes<VecBF, [Z13_VecUnit]> { let Latency = 8; } def : WriteRes<VecBF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; } def : WriteRes<VecDF, [Z13_VecUnit]> { let Latency = 8; } def : WriteRes<VecDF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; } +def : WriteRes<VecDFX, [Z13_VecUnit]> { let Latency = 1; } +def : WriteRes<VecDFX2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 2; } def : WriteRes<VecFPd, [Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit, @@ -179,6 +181,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>; // Move character def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; // Pseudo -> reg move def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>; @@ -268,6 +271,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Multi-register moves @@ -277,6 +281,9 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; +// Load multiple disjoint +def : InstRW<[FXb, Lat30, GroupAlone], (instregex "LMD$")>; + // Store multiple (estimated average of ceil(5/2) FXb ops) def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10, GroupAlone], (instregex "STM(G|H|Y)?$")>; @@ -288,6 +295,7 @@ def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10, def : InstRW<[FXa], (instregex "LRV(G)?R$")>; def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; //===----------------------------------------------------------------------===// // Load address instructions @@ -345,7 +353,7 @@ def : InstRW<[FXa], (instregex "ALGF(I|R)$")>; def : InstRW<[FXa], (instregex "ALGR(K)?$")>; def : InstRW<[FXa], (instregex "ALR(K)?$")>; def : InstRW<[FXa], (instregex "AR(K)?$")>; -def : InstRW<[FXb, LSU, Lat5], (instregex "A(G)?SI$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>; // Logical addition with carry def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>; @@ -438,11 +446,15 @@ def : InstRW<[FXa, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXa, Lat5], (instregex "MGHI$")>; def : InstRW<[FXa, Lat5], (instregex "MHI$")>; def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>; +def : InstRW<[FXa, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXa, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder //===----------------------------------------------------------------------===// +def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>; +def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>; def : InstRW<[FXa, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; def : InstRW<[LSU, 
FXa, Lat30, GroupAlone], (instregex "DSG(F)?$")>; def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>; @@ -456,7 +468,8 @@ def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>; def : InstRW<[FXa], (instregex "SLL(G|K)?$")>; def : InstRW<[FXa], (instregex "SRL(G|K)?$")>; def : InstRW<[FXa], (instregex "SRA(G|K)?$")>; -def : InstRW<[FXa], (instregex "SLA(K)?$")>; +def : InstRW<[FXa], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXa, FXa, FXa, FXa, Lat8], (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -505,7 +518,7 @@ def : InstRW<[FXb, Lat2], (instregex "CGFR$")>; // Compare logical character def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>; - +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; // Test under mask @@ -516,6 +529,9 @@ def : InstRW<[FXb], (instregex "TMHL(64)?$")>; def : InstRW<[FXb], (instregex "TMLH(64)?$")>; def : InstRW<[FXb], (instregex "TMLL(64)?$")>; +// Compare logical characters under mask +def : InstRW<[FXb, LSU, Lat5], (instregex "CLM(H|Y)?$")>; + //===----------------------------------------------------------------------===// // Prefetch and execution hint //===----------------------------------------------------------------------===// @@ -563,6 +579,42 @@ def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>; def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; //===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; +def : InstRW<[FXb, VecDF, FXb, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; + +def : InstRW<[FXb, VecDFX, LSU, LSU, Lat9, GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXb, VecDFX2, LSU, LSU, Lat30, GroupAlone], + (instregex "(M|D)P$")>; +def : InstRW<[FXb, FXb, VecDFX2, LSU, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>; +def : InstRW<[VecDFX, LSU, Lat4, GroupAlone], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + +//===----------------------------------------------------------------------===// // Access registers 
//===----------------------------------------------------------------------===// @@ -640,13 +692,30 @@ def : InstRW<[FXa], (instregex "ZEXT128_(32|64)$")>; // String instructions def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; // Move with key def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>; +// Monitor call +def : InstRW<[FXb], (instregex "MC$")>; + +// Extract CPU attribute +def : InstRW<[FXb, Lat30], (instregex "ECAG$")>; + // Extract CPU Time def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>; +// Extract PSW +def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; + // Execute def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>; @@ -811,14 +880,17 @@ def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>; def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>; def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>; +// Divide to integer +def : InstRW<[VecFPd, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; + //===----------------------------------------------------------------------===// // FP: Comparisons //===----------------------------------------------------------------------===// // Compare -def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)B$")>; -def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)BR?$")>; -def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXBR$")>; +def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>; +def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index a950e54e7601..670df8ff5541 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -59,6 +59,7 @@ def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;} def Z196_FXUnit : ProcResource<2>; def Z196_LSUnit : ProcResource<2>; def Z196_FPUnit : ProcResource<1>; +def Z196_DFUnit : ProcResource<1>; // Subtarget specific definitions of scheduling resources. 
def : WriteRes<FXU, [Z196_FXUnit]> { let Latency = 1; } @@ -66,6 +67,8 @@ def : WriteRes<LSU, [Z196_LSUnit]> { let Latency = 4; } def : WriteRes<LSU_lat1, [Z196_LSUnit]> { let Latency = 1; } def : WriteRes<FPU, [Z196_FPUnit]> { let Latency = 8; } def : WriteRes<FPU2, [Z196_FPUnit, Z196_FPUnit]> { let Latency = 9; } +def : WriteRes<DFU, [Z196_DFUnit]> { let Latency = 2; } +def : WriteRes<DFU2, [Z196_DFUnit, Z196_DFUnit]> { let Latency = 3; } // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -152,6 +155,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>; // Move character def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; // Pseudo -> reg move def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>; @@ -226,6 +230,7 @@ def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Multi-register moves @@ -235,6 +240,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; +// Load multiple disjoint +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; + // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], (instregex "STM(H|Y|G)?$")>; @@ -246,6 +254,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], def : InstRW<[FXU], (instregex "LRV(G)?R$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; //===----------------------------------------------------------------------===// // Load address instructions @@ -285,7 +294,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>; // Addition //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>; def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>; def : InstRW<[FXU], (instregex "AIH$")>; def : InstRW<[FXU], (instregex "AFI(Mux)?$")>; @@ -294,15 +303,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>; def : InstRW<[FXU], (instregex "AGR(K)?$")>; def : InstRW<[FXU], (instregex "AHI(K)?$")>; def : InstRW<[FXU], (instregex "AHIMux(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>; def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>; def : InstRW<[FXU], (instregex "ALGHSIK$")>; def : InstRW<[FXU], (instregex "ALGF(I|R)$")>; def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; // Logical addition with carry def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>; @@ -395,11 +403,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex 
"MH(Y)?$")>; +def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder //===----------------------------------------------------------------------===// +def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "DR$")>; +def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "D$")>; def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone], @@ -416,7 +430,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; -def : InstRW<[FXU, Lat2], (instregex "SLA(K)?$")>; +def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -465,7 +480,7 @@ def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>; // Compare logical character def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>; - +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; // Test under mask @@ -476,6 +491,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>; def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; +// Compare logical characters under mask +def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; + //===----------------------------------------------------------------------===// // Prefetch //===----------------------------------------------------------------------===// @@ -520,6 +538,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>; def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; //===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; +def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; + +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, 
GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex "(M|D)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + +//===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -571,13 +625,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; // Move with key def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>; +// Monitor call +def : InstRW<[FXU], (instregex "MC$")>; + +// Extract CPU attribute +def : InstRW<[FXU, Lat30], (instregex "ECAG$")>; + // Extract CPU Time def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>; +// Extract PSW +def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; + // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -740,14 +811,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>; def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; +// Divide to integer +def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; + //===----------------------------------------------------------------------===// // FP: Comparisons //===----------------------------------------------------------------------===// // Compare -def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>; -def : InstRW<[FPU], (instregex "C(E|D)BR$")>; -def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>; +def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>; +def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index 8ab6c826f1ed..1bdb8779dc72 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -59,6 +59,7 @@ def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;} def ZEC12_FXUnit : ProcResource<2>; def ZEC12_LSUnit : ProcResource<2>; def ZEC12_FPUnit : ProcResource<1>; +def ZEC12_DFUnit : ProcResource<1>; def ZEC12_VBUnit : ProcResource<1>; // Subtarget specific definitions of scheduling resources. 
@@ -67,6 +68,8 @@ def : WriteRes<LSU, [ZEC12_LSUnit]> { let Latency = 4; } def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; } def : WriteRes<FPU, [ZEC12_FPUnit]> { let Latency = 8; } def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; } +def : WriteRes<DFU, [ZEC12_DFUnit]> { let Latency = 2; } +def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_FPUnit]> { let Latency = 3; } def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -155,6 +158,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>; // Move character def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; // Pseudo -> reg move def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>; @@ -236,6 +240,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Multi-register moves @@ -245,6 +250,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; +// Load multiple disjoint +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>; + // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], (instregex "STM(H|Y|G)?$")>; @@ -256,6 +264,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], def : InstRW<[FXU], (instregex "LRV(G)?R$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; //===----------------------------------------------------------------------===// // Load address instructions @@ -295,7 +304,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>; // Addition //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>; def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>; def : InstRW<[FXU], (instregex "AIH$")>; def : InstRW<[FXU], (instregex "AFI(Mux)?$")>; @@ -304,15 +313,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>; def : InstRW<[FXU], (instregex "AGR(K)?$")>; def : InstRW<[FXU], (instregex "AHI(K)?$")>; def : InstRW<[FXU], (instregex "AHIMux(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>; def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>; def : InstRW<[FXU], (instregex "ALGHSIK$")>; def : InstRW<[FXU], (instregex "ALGF(I|R)$")>; def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; // Logical addition with carry def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>; @@ -405,11 +413,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex 
"MH(Y)?$")>; +def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder //===----------------------------------------------------------------------===// +def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "DR$")>; +def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "D$")>; def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone], @@ -426,7 +440,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; -def : InstRW<[FXU], (instregex "SLA(K)?$")>; +def : InstRW<[FXU], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -475,7 +490,7 @@ def : InstRW<[FXU, Lat2], (instregex "CGFR$")>; // Compare logical character def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>; - +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; // Test under mask @@ -486,6 +501,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>; def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; +// Compare logical characters under mask +def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; + //===----------------------------------------------------------------------===// // Prefetch and execution hint //===----------------------------------------------------------------------===// @@ -532,6 +550,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>; def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; //===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; +def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; + +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, 
GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex "(M|D)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + +//===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -609,13 +663,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; // Move with key def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>; +// Monitor call +def : InstRW<[FXU], (instregex "MC$")>; + +// Extract CPU attribute +def : InstRW<[FXU, Lat30], (instregex "ECAG$")>; + // Extract CPU Time def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>; +// Extract PSW +def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; + // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -778,14 +849,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>; def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; +// Divide to integer +def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; + //===----------------------------------------------------------------------===// // FP: Comparisons //===----------------------------------------------------------------------===// // Compare -def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>; -def : InstRW<[FPU], (instregex "C(E|D)BR$")>; -def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>; +def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>; +def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>; diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index ce07ea3318a5..022679a7bc18 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -37,12 +37,13 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, const TargetMachine &TM) : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), - HasPopulationCount(false), HasFastSerialization(false), - HasInterlockedAccess1(false), HasMiscellaneousExtensions(false), + HasPopulationCount(false), HasMessageSecurityAssist4(false), + HasFastSerialization(false), HasInterlockedAccess1(false), + HasMiscellaneousExtensions(false), HasExecutionHint(false), HasLoadAndTrap(false), HasTransactionalExecution(false), HasProcessorAssist(false), HasVector(false), HasLoadStoreOnCond2(false), - HasLoadAndZeroRightmostByte(false), + HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false), TargetTriple(TT), 
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(), FrameLowering() {} diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index cdb61327a16a..770dd7cd939f 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -39,6 +39,7 @@ protected: bool HasHighWord; bool HasFPExtension; bool HasPopulationCount; + bool HasMessageSecurityAssist4; bool HasFastSerialization; bool HasInterlockedAccess1; bool HasMiscellaneousExtensions; @@ -49,6 +50,7 @@ protected: bool HasVector; bool HasLoadStoreOnCond2; bool HasLoadAndZeroRightmostByte; + bool HasMessageSecurityAssist5; private: Triple TargetTriple; @@ -104,6 +106,10 @@ public: // Return true if the target has the population-count facility. bool hasPopulationCount() const { return HasPopulationCount; } + // Return true if the target has the message-security-assist + // extension facility 4. + bool hasMessageSecurityAssist4() const { return HasMessageSecurityAssist4; } + // Return true if the target has the fast-serialization facility. bool hasFastSerialization() const { return HasFastSerialization; } @@ -132,6 +138,10 @@ public: return HasLoadAndZeroRightmostByte; } + // Return true if the target has the message-security-assist + // extension facility 5. + bool hasMessageSecurityAssist5() const { return HasMessageSecurityAssist5; } + // Return true if the target has the vector facility. bool hasVector() const { return HasVector; } diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 73d1d4be293b..6b45839c14b0 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -19,8 +19,8 @@ let Defs = [ARGUMENTS] in { // Call sequence markers. These have an immediate which represents the amount of // stack space to allocate or free, which is used for varargs lowering. let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in { -def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt), - [(WebAssemblycallseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt, i32imm:$amt2), + [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>; def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2), [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>; } // isCodeGenOnly = 1 diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index a601b575f579..fa2146f7db84 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -25,7 +25,8 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, // WebAssembly-specific DAG Node Types. 
//===----------------------------------------------------------------------===// -def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>; +def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>, + SDTCisVT<1, iPTR>]>; def SDT_WebAssemblyCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 784c3a6557ff..3a421fe77392 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; +def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true", + "LEA instruction with 3 ops or certain registers is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; def FeatureSoftFloat @@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [ FeatureXSAVE, FeatureXSAVEOPT, FeatureLAHFSAHF, + FeatureSlow3OpsLEA, FeatureFastScalarFSQRT, FeatureFastSHLDRotate ]>; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index ebd179e786da..fc3b4836c178 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -180,44 +180,6 @@ private: } // end anonymous namespace. -static std::pair<X86::CondCode, bool> -getX86ConditionCode(CmpInst::Predicate Predicate) { - X86::CondCode CC = X86::COND_INVALID; - bool NeedSwap = false; - switch (Predicate) { - default: break; - // Floating-point Predicates - case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; - case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH; - case CmpInst::FCMP_OGT: CC = X86::COND_A; break; - case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH; - case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; - case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH; - case CmpInst::FCMP_ULT: CC = X86::COND_B; break; - case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH; - case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; - case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; - case CmpInst::FCMP_UNO: CC = X86::COND_P; break; - case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; - case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH; - case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; - - // Integer Predicates - case CmpInst::ICMP_EQ: CC = X86::COND_E; break; - case CmpInst::ICMP_NE: CC = X86::COND_NE; break; - case CmpInst::ICMP_UGT: CC = X86::COND_A; break; - case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; - case CmpInst::ICMP_ULT: CC = X86::COND_B; break; - case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; - case CmpInst::ICMP_SGT: CC = X86::COND_G; break; - case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; - case CmpInst::ICMP_SLT: CC = X86::COND_L; break; - case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; - } - - return std::make_pair(CC, NeedSwap); -} - static std::pair<unsigned, bool> getX86SSEConditionCode(CmpInst::Predicate Predicate) { unsigned CC; @@ -1559,7 +1521,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { X86::CondCode CC; bool SwapArgs; - std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); unsigned Opc = 
X86::getSETFromCond(CC); @@ -1697,7 +1659,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { bool SwapArgs; unsigned BranchOpc; - std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); BranchOpc = X86::GetCondBranchFromCond(CC); @@ -2070,7 +2032,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { } bool NeedSwap; - std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); + std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); const Value *CmpLHS = CI->getOperand(0); @@ -2319,7 +2281,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { const auto *CI = dyn_cast<CmpInst>(Cond); if (CI && (CI->getParent() == I->getParent())) { bool NeedSwap; - std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); + std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate()); if (CC > X86::LAST_VALID_COND) return false; @@ -3293,7 +3255,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes).addImm(0); + .addImm(NumBytes).addImm(0).addImm(0); // Walk the register/memloc assignments, inserting copies/loads. const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 2cd4c1a3e7b3..9f649dad8bc0 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -27,20 +27,26 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; -#define DEBUG_TYPE "x86-fixup-LEAs" +namespace llvm { +void initializeFixupLEAPassPass(PassRegistry &); +} + +#define FIXUPLEA_DESC "X86 LEA Fixup" +#define FIXUPLEA_NAME "x86-fixup-LEAs" + +#define DEBUG_TYPE FIXUPLEA_NAME STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { class FixupLEAPass : public MachineFunctionPass { enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; - static char ID; + /// \brief Loop over all of the instructions in the basic block /// replacing applicable instructions with LEA instructions, /// where appropriate. bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); - StringRef getPassName() const override { return "X86 LEA Fixup"; } /// \brief Given a machine register, look for the instruction /// which writes it in the current basic block. If found, @@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass { void processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI); + + /// \brief Given a LEA instruction which is unprofitable + /// on SNB+ try to replace it with other instructions. + /// According to Intel's Optimization Reference Manual: + /// " For LEA instructions with three source operands and some specific + /// situations, instruction latency has increased to 3 cycles, and must + /// dispatch via port 1: + /// - LEA that has all three source operands: base, index, and offset + /// - LEA that uses base and index registers where the base is EBP, RBP, + /// or R13 + /// - LEA that uses RIP relative addressing mode + /// - LEA that uses 16-bit addressing mode " + /// This function currently handles the first 2 cases only. 
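The pass comment above is effectively the specification of the new Slow3OpsLEA fixup: a LEA is treated as slow on SNB+ either when it uses all three sources (base, index and a non-zero or symbolic displacement) or when it pairs an EBP/RBP/R13 base with an index register. A minimal standalone sketch of that classification, using illustrative enums and structs rather than the real MachineOperand/register numbering (every name below is an assumption made for the example, not LLVM API):

```cpp
#include <cstdio>

// Illustrative stand-ins for the operands a LEA MachineInstr would carry.
enum Reg { NoReg, EAX, EBX, EBP, RBP, R13 };

struct LEAOperands {
  Reg base;
  Reg index;
  long long offset;     // displacement
  bool offsetIsSymbol;  // e.g. a global address
};

// Base registers that force the slow 3-cycle LEA on SNB+.
static bool isInefficientLEAReg(Reg r) {
  return r == EBP || r == RBP || r == R13;
}

static bool hasLEAOffset(const LEAOperands &lea) {
  return lea.offset != 0 || lea.offsetIsSymbol;
}

// Candidate for the Slow3OpsLEA rewrite: all three sources are used,
// or an EBP/RBP/R13 base is combined with an index register.
static bool isSlowLEACandidate(const LEAOperands &lea) {
  bool threeOps = lea.base != NoReg && lea.index != NoReg && hasLEAOffset(lea);
  bool badBase  = isInefficientLEAReg(lea.base) && lea.index != NoReg;
  return threeOps || badBase;
}

int main() {
  LEAOperands a{EAX, EBX, 16, false};  // lea 16(%eax,%ebx) -> 3 ops, slow
  LEAOperands b{EBP, EBX, 0, false};   // lea (%ebp,%ebx)   -> bad base, slow
  LEAOperands c{EAX, NoReg, 8, false}; // lea 8(%eax)       -> fine
  std::printf("%d %d %d\n", isSlowLEACandidate(a), isSlowLEACandidate(b),
              isSlowLEACandidate(c));
}
```

The real pass additionally bails out when a rewrite would need more than two replacement instructions, as the function body further down shows.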
+ MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, + MachineFunction::iterator MFI); + /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg /// and convert them to INC or DEC respectively. bool fixupIncDec(MachineBasicBlock::iterator &I, @@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass { MachineBasicBlock::iterator &MBBI) const; public: - FixupLEAPass() : MachineFunctionPass(ID) {} + static char ID; + + StringRef getPassName() const override { return FIXUPLEA_DESC; } + + FixupLEAPass() : MachineFunctionPass(ID) { + initializeFixupLEAPassPass(*PassRegistry::getPassRegistry()); + } /// \brief Loop over all of the basic blocks, /// replacing instructions by equivalent LEA instructions @@ -104,9 +132,12 @@ private: bool OptIncDec; bool OptLEA; }; -char FixupLEAPass::ID = 0; } +char FixupLEAPass::ID = 0; + +INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false) + MachineInstr * FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI) const { @@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); - OptLEA = ST.LEAusesAG() || ST.slowLEA(); + OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA(); if (!OptLEA && !OptIncDec) return false; @@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return MachineBasicBlock::iterator(); } -static inline bool isLEA(const int opcode) { - return opcode == X86::LEA16r || opcode == X86::LEA32r || - opcode == X86::LEA64r || opcode == X86::LEA64_32r; +static inline bool isLEA(const int Opcode) { + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +static inline bool isInefficientLEAReg(unsigned int Reg) { + return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13; +} + +static inline bool isRegOperand(const MachineOperand &Op) { + return Op.isReg() && Op.getReg() != X86::NoRegister; +} +/// hasIneffecientLEARegs - LEA that uses base and index registers +/// where the base is EBP, RBP, or R13 +static inline bool hasInefficientLEABaseReg(const MachineOperand &Base, + const MachineOperand &Index) { + return Base.isReg() && isInefficientLEAReg(Base.getReg()) && + isRegOperand(Index); +} + +static inline bool hasLEAOffset(const MachineOperand &Offset) { + return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal(); +} + +// LEA instruction that has all three operands: offset, base and index +static inline bool isThreeOperandsLEA(const MachineOperand &Base, + const MachineOperand &Index, + const MachineOperand &Offset) { + return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset); +} + +static inline int getADDrrFromLEA(int LEAOpcode) { + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + return X86::ADD16rr; + case X86::LEA32r: + return X86::ADD32rr; + case X86::LEA64_32r: + case X86::LEA64r: + return X86::ADD64rr; + } +} + +static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) { + bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm()); + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri; + case X86::LEA32r: + case X86::LEA64_32r: + return IsInt8 ? 
X86::ADD32ri8 : X86::ADD32ri; + case X86::LEA64r: + return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32; + } } /// isLEASimpleIncOrDec - Does this LEA have one these forms: @@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p, void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineInstr &MI = *I; - const int opcode = MI.getOpcode(); - if (!isLEA(opcode)) + const int Opcode = MI.getOpcode(); + if (!isLEA(Opcode)) return; if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() || !TII->isSafeToClobberEFLAGS(*MFI, I)) @@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; if (MI.getOperand(2).getImm() > 1) return; - int addrr_opcode, addri_opcode; - switch (opcode) { - default: - llvm_unreachable("Unexpected LEA instruction"); - case X86::LEA16r: - addrr_opcode = X86::ADD16rr; - addri_opcode = X86::ADD16ri; - break; - case X86::LEA32r: - addrr_opcode = X86::ADD32rr; - addri_opcode = X86::ADD32ri; - break; - case X86::LEA64_32r: - case X86::LEA64r: - addrr_opcode = X86::ADD64rr; - addri_opcode = X86::ADD64ri32; - break; - } DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); DEBUG(dbgs() << "FixLEA: Replaced by: ";); MachineInstr *NewMI = nullptr; - const MachineOperand &Dst = MI.getOperand(0); // Make ADD instruction for two registers writing to LEA's destination if (SrcR1 != 0 && SrcR2 != 0) { - const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3); - const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1); - NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode)) - .add(Dst) - .add(Src1) - .add(Src2); - MFI->insert(I, NewMI); + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode)); + const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1); + NewMI = + BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src); DEBUG(NewMI->dump();); } // Make ADD instruction for immediate if (MI.getOperand(4).getImm() != 0) { + const MCInstrDesc &ADDri = + TII->get(getADDriFromLEA(Opcode, MI.getOperand(4))); const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3); - NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode)) - .add(Dst) + NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR) .add(SrcR) .addImm(MI.getOperand(4).getImm()); - MFI->insert(I, NewMI); DEBUG(NewMI->dump();); } if (NewMI) { MFI->erase(I); - I = static_cast<MachineBasicBlock::iterator>(NewMI); + I = NewMI; + } +} + +MachineInstr * +FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, + MachineFunction::iterator MFI) { + + const int LEAOpcode = MI.getOpcode(); + if (!isLEA(LEAOpcode)) + return nullptr; + + const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Base = MI.getOperand(1); + const MachineOperand &Scale = MI.getOperand(2); + const MachineOperand &Index = MI.getOperand(3); + const MachineOperand &Offset = MI.getOperand(4); + const MachineOperand &Segment = MI.getOperand(5); + + if (!(isThreeOperandsLEA(Base, Index, Offset) || + hasInefficientLEABaseReg(Base, Index)) || + !TII->isSafeToClobberEFLAGS(*MFI, MI) || + Segment.getReg() != X86::NoRegister) + return nullptr; + + unsigned int DstR = Dst.getReg(); + unsigned int BaseR = Base.getReg(); + unsigned int IndexR = Index.getReg(); + unsigned SSDstR = + (LEAOpcode == X86::LEA64_32r) ? 
getX86SubSuperRegister(DstR, 64) : DstR; + bool IsScale1 = Scale.getImm() == 1; + bool IsInefficientBase = isInefficientLEAReg(BaseR); + bool IsInefficientIndex = isInefficientLEAReg(IndexR); + + // Skip these cases since it takes more than 2 instructions + // to replace the LEA instruction. + if (IsInefficientBase && SSDstR == BaseR && !IsScale1) + return nullptr; + if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && + (IsInefficientIndex || !IsScale1)) + return nullptr; + + const DebugLoc DL = MI.getDebugLoc(); + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); + const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); + + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: ";); + + // First try to replace LEA with one or two (for the 3-op LEA case) + // add instructions: + // 1.lea (%base,%index,1), %base => add %index,%base + // 2.lea (%base,%index,1), %index => add %base,%index + if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { + const MachineOperand &Src = DstR == BaseR ? Index : Base; + MachineInstr *NewMI = + BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. + if (hasLEAOffset(Offset)) { + NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + DEBUG(NewMI->dump();); + } + return NewMI; + } + // If the base is inefficient try switching the index and base operands, + // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: + // lea offset(%base,%index,scale),%dst => + // lea (%base,%index,scale); add offset,%dst + if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { + MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + .add(Dst) + .add(IsInefficientBase ? Index : Base) + .add(Scale) + .add(IsInefficientBase ? Base : Index) + .addImm(0) + .add(Segment); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. 
+ if (hasLEAOffset(Offset)) { + NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + DEBUG(NewMI->dump();); + } + return NewMI; + } + // Handle the rest of the cases with inefficient base register: + assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!"); + assert(IsInefficientBase && "efficient base should be handled already!"); + + // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst + if (IsScale1 && !hasLEAOffset(Offset)) { + TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill()); + DEBUG(MI.getPrevNode()->dump();); + + MachineInstr *NewMI = + BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); + DEBUG(NewMI->dump();); + return NewMI; } + // lea offset(%base,%index,scale), %dst => + // lea offset( ,%index,scale), %dst; add %base,%dst + MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + .add(Dst) + .addReg(0) + .add(Scale) + .add(Index) + .add(Offset) + .add(Segment); + DEBUG(NewMI->dump();); + + NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); + DEBUG(NewMI->dump();); + return NewMI; } bool FixupLEAPass::processBasicBlock(MachineFunction &MF, @@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, if (OptLEA) { if (MF.getSubtarget<X86Subtarget>().isSLM()) processInstructionForSLM(I, MFI); - else - processInstruction(I, MFI); + + else { + if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) { + if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) { + MFI->erase(I); + I = NewMI; + } + } else + processInstruction(I, MFI); + } } } return false; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 12a10bf3072f..c899f0fd5100 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1178,8 +1178,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; - if (ConstantSDNode - *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) { unsigned Val = CN->getZExtValue(); // Note that we handle x<<1 as (,x,2) rather than (x,x) here so // that the base operand remains free for further matching. If @@ -1187,15 +1186,14 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // in MatchAddress turns (,x,2) into (x,x), which is cheaper. if (Val == 1 || Val == 2 || Val == 3) { AM.Scale = 1 << Val; - SDValue ShVal = N.getNode()->getOperand(0); + SDValue ShVal = N.getOperand(0); // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. 
if (CurDAG->isBaseWithConstantOffset(ShVal)) { - AM.IndexReg = ShVal.getNode()->getOperand(0); - ConstantSDNode *AddVal = - cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); + AM.IndexReg = ShVal.getOperand(0); + ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1)); uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; if (!foldOffsetIntoAddress(Disp, AM)) return false; @@ -1245,28 +1243,27 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr) { - if (ConstantSDNode - *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || CN->getZExtValue() == 9) { AM.Scale = unsigned(CN->getZExtValue())-1; - SDValue MulVal = N.getNode()->getOperand(0); + SDValue MulVal = N.getOperand(0); SDValue Reg; // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && - isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) { - Reg = MulVal.getNode()->getOperand(0); + isa<ConstantSDNode>(MulVal.getOperand(1))) { + Reg = MulVal.getOperand(0); ConstantSDNode *AddVal = - cast<ConstantSDNode>(MulVal.getNode()->getOperand(1)); + cast<ConstantSDNode>(MulVal.getOperand(1)); uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); if (foldOffsetIntoAddress(Disp, AM)) - Reg = N.getNode()->getOperand(0); + Reg = N.getOperand(0); } else { - Reg = N.getNode()->getOperand(0); + Reg = N.getOperand(0); } AM.IndexReg = AM.Base_Reg = Reg; @@ -1289,7 +1286,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; - if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { + if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { AM = Backup; break; } @@ -1300,7 +1297,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } int Cost = 0; - SDValue RHS = Handle.getValue().getNode()->getOperand(1); + SDValue RHS = Handle.getValue().getOperand(1); // If the RHS involves a register with multiple uses, this // transformation incurs an extra mov, due to the neg instruction // clobbering its operand. @@ -1309,7 +1306,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, RHS.getNode()->getOpcode() == ISD::TRUNCATE || RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && - RHS.getNode()->getOperand(0).getValueType() == MVT::i32)) + RHS.getOperand(0).getValueType() == MVT::i32)) ++Cost; // If the base is a register with multiple uses, this // transformation may save a mov. 
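The 3/5/9 multiplier case in matchAddressRecursively above rests on a small piece of arithmetic that is worth spelling out: x * 9 becomes base = index = x with scale 8, and an added constant folds into the displacement as AddVal * CN. A self-contained sketch of that decomposition (plain C++; AddrMode and decomposeMul359 are illustrative names, not the real X86ISelAddressMode):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Decomposition of (x + addend) * mult, mult in {3, 5, 9}, into the LEA form
// base + index * scale + disp with base == index == x.
struct AddrMode {
  const char *baseIndexReg; // base and index refer to the same register
  unsigned scale;           // 2, 4 or 8
  int64_t disp;
};

AddrMode decomposeMul359(const char *reg, int64_t addend, unsigned mult) {
  assert(mult == 3 || mult == 5 || mult == 9);
  AddrMode am;
  am.baseIndexReg = reg;
  am.scale = mult - 1;               // x*9 == x + x*8, etc.
  am.disp = addend * (int64_t)mult;  // (x + a)*m == x + x*(m-1) + a*m
  return am;
}

int main() {
  // (x + 4) * 9  ->  lea 36(%x,%x,8), %dst
  AddrMode am = decomposeMul359("x", 4, 9);
  std::printf("lea %lld(%%%s,%%%s,%u)\n", (long long)am.disp,
              am.baseIndexReg, am.baseIndexReg, am.scale);
}
```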
@@ -2524,7 +2521,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8 && X86::isZeroNode(N1)) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1)); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!C) break; // For example, convert "testl %eax, $8" to "testb %al, $8" @@ -2532,7 +2529,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { (!(C->getZExtValue() & 0x80) || hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8); - SDValue Reg = N0.getNode()->getOperand(0); + SDValue Reg = N0.getOperand(0); // On x86-32, only the ABCD registers have 8-bit subregisters. if (!Subtarget->is64Bit()) { @@ -2568,7 +2565,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Shift the immediate right by 8 bits. SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, dl, MVT::i8); - SDValue Reg = N0.getNode()->getOperand(0); + SDValue Reg = N0.getOperand(0); // Put the value in an ABCD register. const TargetRegisterClass *TRC; @@ -2605,7 +2602,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i16); - SDValue Reg = N0.getNode()->getOperand(0); + SDValue Reg = N0.getOperand(0); // Extract the 16-bit subregister. SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, @@ -2628,7 +2625,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i32); - SDValue Reg = N0.getNode()->getOperand(0); + SDValue Reg = N0.getOperand(0); // Extract the 32-bit subregister. SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9ee2234595f9..11c08292518a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" @@ -79,6 +80,17 @@ static cl::opt<int> ExperimentalPrefLoopAlignment( " of the loop header PC will be 0)."), cl::Hidden); +/// Call this when the user attempts to do something unsupported, like +/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike +/// report_fatal_error, so calling code should attempt to recover without +/// crashing. 
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, + const char *Msg) { + MachineFunction &MF = DAG.getMachineFunction(); + DAG.getContext()->diagnose( + DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc())); +} + X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -1381,7 +1393,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); @@ -1445,8 +1457,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); - setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); @@ -1479,7 +1489,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); @@ -2207,15 +2217,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { - report_fatal_error("SSE register return with SSE disabled"); + (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } else if (ValVT == MVT::f64 && + (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { + // Likewise we can't return F64 values with SSE1 only. gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. + errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } - // Likewise we can't return F64 values with SSE1 only. gcc does so, but - // llvm-gcc has never done it right and no one has noticed, so this - // should be OK for now. - if (ValVT == MVT::f64 && - (Subtarget.is64Bit() && !Subtarget.hasSSE2())) - report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. 
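The hunk above swaps report_fatal_error for errorUnsupported plus VA.convertToReg(X86::FP0): the problem is reported through the context's diagnostic machinery and a safe register is substituted so lowering can keep going instead of aborting. A generic sketch of that "diagnose, then fall back" shape (this is not the LLVM DiagnosticInfo API, just the control flow it enables; all names are illustrative):

```cpp
#include <functional>
#include <iostream>
#include <string>

// Stand-in for a diagnostic handler installed by the front end.
using DiagHandler = std::function<void(const std::string &)>;

// Recoverable path: report the problem, then return a value the rest of the
// pipeline can still consume (analogous to rewriting the return register to
// FP0 above) instead of terminating the whole compilation.
int lowerReturnReg(bool hasSSE, int requestedReg, const DiagHandler &diag) {
  constexpr int FallbackReg = 0; // stands in for X86::FP0 in this sketch
  if (!hasSSE) {
    diag("SSE register return with SSE disabled");
    return FallbackReg; // keep going; the caller may still finish codegen
  }
  return requestedReg;
}

int main() {
  auto handler = [](const std::string &msg) { std::cerr << "error: " << msg << "\n"; };
  std::cout << lowerReturnReg(false, 17, handler) << "\n"; // prints 0 after the diagnostic
}
```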
@@ -2528,7 +2540,8 @@ SDValue X86TargetLowering::LowerCallResult( // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { - report_fatal_error("SSE register return with SSE disabled"); + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and @@ -3415,8 +3428,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!IsSibcall) - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, + NumBytes - NumBytesToPush, dl); SDValue RetAddrFrIdx; // Load return address for tail calls. @@ -6912,9 +6925,9 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) - return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), - DAG.getConstant(1, dl, VT), - DAG.getConstant(0, dl, VT)); + return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), + DAG.getConstant(1, dl, VT), + DAG.getConstant(0, dl, VT)); // insert elements one by one SDValue DstVec; @@ -8386,9 +8399,9 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; - return DAG.getNode(ISD::VSELECT, DL, VT, VMask, - DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), - ZeroVector); + return DAG.getSelect(DL, VT, VMask, + DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), + ZeroVector); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, @@ -8748,8 +8761,9 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( - VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, - DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); + VT, + DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), + V1, V2)); } case MVT::v16f32: case MVT::v8f64: @@ -13817,6 +13831,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); + // If this VSELECT has a vector if i1 as a mask, it will be directly matched + // with patterns on the mask registers on AVX-512. + if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) + return Op; + // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) @@ -13826,10 +13845,30 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget.hasSSE41()) return SDValue(); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition + // into an i1 condition so that we can use the mask-based 512-bit blend + // instructions. + if (VT.getSizeInBits() == 512) { + SDValue Cond = Op.getOperand(0); + // The vNi1 condition case should be handled above as it can be trivially + // lowered. 
+ assert(Cond.getValueType().getScalarSizeInBits() == + VT.getScalarSizeInBits() && + "Should have a size-matched integer condition!"); + // Build a mask by testing the condition against itself (tests for zero). + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond); + // Now return a new VSELECT using the mask. + return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2)); + } + // Only some types will be legal on some subtargets. If we can emit a legal // VSELECT-matching blend, return Op, and but if we need to expand, return // a null value. - switch (Op.getSimpleValueType().SimpleTy) { + switch (VT.SimpleTy) { default: // Most of the vector types have blends past SSE4.1. return Op; @@ -14725,7 +14764,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), @@ -15348,8 +15387,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); - SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, - Zero, Four); + SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. @@ -15621,7 +15659,7 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SDValue Zero = DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); - SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero); + SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero); if (VT == ExtVT) return SelectedVal; return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal); @@ -16713,7 +16751,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, if (BitWidth > AndBitWidth) { KnownBits Known; DAG.computeKnownBits(Op0, Known); - if (Known.Zero.countLeadingOnes() < BitWidth - AndBitWidth) + if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth) return SDValue(); } LHS = Op1; @@ -17455,7 +17493,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { MVT VCmpVT = VT == MVT::f32 ? 
MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); - SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); + SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, DAG.getIntPtrConstant(0, DL)); @@ -17483,9 +17521,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) Op2Scalar = Op2.getOperand(0); if (Op1Scalar.getNode() && Op2Scalar.getNode()) { - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, - Op1Scalar.getValueType(), - Cond, Op1Scalar, Op2Scalar); + SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, + Op1Scalar, Op2Scalar); if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, newSelect); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); @@ -17500,8 +17537,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1, - Cond, Op1, Op2); + SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); } @@ -17770,7 +17806,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, } else { SDValue NegOne = getOnesVector(ExtVT, DAG, dl); SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); - V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); + V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero); if (ExtVT == VT) return V; } @@ -18572,7 +18608,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); bool Is64Bit = Subtarget.is64Bit(); MVT SPTy = getPointerTy(DAG.getDataLayout()); @@ -19021,8 +19057,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - if (isAllOnesConstant(Mask)) - return Op; + + if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask)) + if (MaskConst->getZExtValue() & 0x1) + return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -19081,7 +19119,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, // registration, or the .set_setframe offset. 
MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( - GlobalValue::getRealLinkageName(Fn->getName())); + GlobalValue::dropLLVMManglingEscape(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); @@ -19683,12 +19721,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } - case CONVERT_MASK_TO_VEC: { - SDValue Mask = Op.getOperand(1); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - return DAG.getNode(IntrData->Opc0, dl, VT, VMask); - } case BRCST_SUBVEC_TO_VEC: { SDValue Src = Op.getOperand(1); SDValue Passthru = Op.getOperand(2); @@ -19932,7 +19964,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Op1 = Op.getOperand(1); auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( - GlobalValue::getRealLinkageName(Fn->getName())); + GlobalValue::dropLLVMManglingEscape(Fn->getName())); // Generate a simple absolute symbol reference. This intrinsic is only // supported on 32-bit Windows, which isn't PIC. @@ -21741,6 +21773,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); SDValue Ex = DAG.getBitcast(ExVT, R); + // ashr(R, 63) === cmp_slt(R, 0) + if (ShiftAmt == 63 && Subtarget.hasSSE42()) { + assert((VT != MVT::v4i64 || Subtarget.hasInt256()) && + "Unsupported PCMPGT op"); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, + getZeroVector(VT, Subtarget, DAG, dl), R); + } + if (ShiftAmt >= 32) { // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. SDValue Upper = @@ -21839,10 +21879,19 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } // Special case in 32-bit mode, where i64 is expanded into high and low parts. + // TODO: Replace constant extraction with getTargetConstantBitsFromNode. if (!Subtarget.is64Bit() && !Subtarget.hasXOP() && (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) || (Subtarget.hasAVX512() && VT == MVT::v8i64))) { + // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant. + unsigned SubVectorScale = 1; + if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + SubVectorScale = + Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits(); + Amt = Amt.getOperand(0); + } + // Peek through any splat that was introduced for i64 shift vectorization. int SplatIndex = -1; if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) @@ -21859,7 +21908,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / - VT.getVectorNumElements(); + (SubVectorScale * VT.getVectorNumElements()); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; unsigned BaseOp = (SplatIndex < 0 ? 
0 : SplatIndex * Ratio); @@ -22233,23 +22282,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); - return DAG.getBitcast(SelVT, - DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } else if (Subtarget.hasSSE41()) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); - return DAG.getBitcast(SelVT, - DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); - return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); + return DAG.getSelect(dl, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; @@ -22371,15 +22418,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); - return DAG.getBitcast( - VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); + return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in // its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue C = DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); - return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); + return DAG.getSelect(dl, VT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 12; @@ -23296,9 +23342,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - Type *RetTy = isF64 - ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) - : (Type*)VectorType::get(ArgTy, 4); + Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) + : (Type *)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) @@ -25779,7 +25824,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, // Emit CALLSEQ_START right before the instruction. unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = - BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0); + BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. @@ -26517,7 +26562,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); - + case TargetOpcode::PATCHABLE_EVENT_CALL: // Do nothing here, handle in xray instrumentation pass. 
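The LowerShift hunks above keep relying on the PBLENDVB-based variable shift: the blend selects each byte purely on the sign bit of its selector, so the per-byte shift amount is moved into the top bits (a << 5 for i8) and the value is run through conditional shifts by 4, 2 and 1, doubling the selector between rounds to expose the next amount bit. A scalar emulation of that sequence (helper names are illustrative, not LLVM code):

```cpp
#include <cstdint>
#include <cstdio>

// Scalar model of the SSE4.1 trick: three rounds of conditional power-of-two
// shifts, each gated on the "sign bit" of the selector, emulate a per-byte
// variable shift without per-element shift hardware.
static void varShiftLeftBytes(uint8_t val[16], const uint8_t amt[16]) {
  const int steps[3] = {4, 2, 1};
  for (int i = 0; i < 16; ++i) {
    uint8_t sel = (uint8_t)(amt[i] << 5); // bit 2 of the amount is now the sign bit
    uint8_t r = val[i];
    for (int s : steps) {
      if (sel & 0x80)              // the sign-bit select PBLENDVB performs
        r = (uint8_t)(r << s);
      sel = (uint8_t)(sel + sel);  // next round looks at the next amount bit
    }
    val[i] = r;
  }
}

int main() {
  uint8_t v[16] = {1, 1, 1, 1, 3, 3, 3, 3, 2, 2, 2, 2, 7, 7, 7, 7};
  uint8_t a[16] = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3};
  varShiftLeftBytes(v, a);
  for (int i = 0; i < 16; ++i) std::printf("%u ", v[i]);
  std::printf("\n"); // 1 2 4 8 3 6 12 24 32 64 128 0 7 14 28 56
  return 0;
}
```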
return BB; @@ -29532,7 +29577,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, DAG.getAllOnesConstant(DL, CondVT)); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 - return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); + return DAG.getSelect(DL, VT, CondNew, RHS, LHS); } // To use the condition operand as a bitwise mask, it must have elements that @@ -30015,7 +30060,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); - return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); + return DAG.getSelect(DL, VT, Cond, LHS, RHS); } } } @@ -31561,20 +31606,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // (sub (xor X, M), M) static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - assert(N->getOpcode() == ISD::OR); + assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); - if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256()))) + if (!((VT.is128BitVector() && Subtarget.hasSSE2()) || + (VT.is256BitVector() && Subtarget.hasInt256()))) return SDValue(); - assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!"); - // Canonicalize pandn to RHS - if (N0.getOpcode() == X86ISD::ANDNP) + // Canonicalize AND to LHS. + if (N1.getOpcode() == ISD::AND) std::swap(N0, N1); + // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for + // ANDNP combine allows other combines to happen that prevent matching. if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) return SDValue(); @@ -31596,20 +31643,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, Y = peekThroughBitcasts(Y); EVT MaskVT = Mask.getValueType(); - - // Validate that the Mask operand is a vector sra node. - // FIXME: what to do for bytes, since there is a psignb/pblendvb, but - // there is no psrai.b unsigned EltBits = MaskVT.getScalarSizeInBits(); - unsigned SraAmt = ~0; - if (Mask.getOpcode() == ISD::SRA) { - if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) - if (auto *AmtConst = AmtBV->getConstantSplatNode()) - SraAmt = AmtConst->getZExtValue(); - } else if (Mask.getOpcode() == X86ISD::VSRAI) - SraAmt = Mask.getConstantOperandVal(1); - if ((SraAmt + 1) != EltBits) + // TODO: Attempt to handle floating point cases as well? 
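The combineLogicBlendIntoPBLENDV change above widens the mask check to any lane mask whose bits are all copies of the sign bit (ComputeNumSignBits(Mask) == EltBits) rather than matching an explicit sign-bit SRA. The arithmetic fold it guards, (sub (xor X, M), M), is the classic branchless conditional negation: with a per-lane mask that is all-ones or all-zeros, (x ^ m) - m yields -x for m == -1 and x for m == 0 (the orientation used by the combine depends on which blend operand carried the negation). A tiny check of the identity itself (condNegate is an illustrative name):

```cpp
#include <cassert>
#include <cstdint>

// Branchless conditional negation: m must be all-ones (-1) or all-zeros (0).
// (x ^ -1) - (-1) == ~x + 1 == -x, and (x ^ 0) - 0 == x.
int32_t condNegate(int32_t x, int32_t m) {
  assert(m == 0 || m == -1);
  return (x ^ m) - m;
}

int main() {
  const int32_t vals[] = {0, 1, -7, 123456};
  for (int32_t x : vals) {
    assert(condNegate(x, 0) == x);
    assert(condNegate(x, -1) == -x);
  }
  return 0;
}
```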
+ if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) return SDValue(); SDLoc DL(N); @@ -31630,7 +31667,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, // (add (xor X, M), (and M, 1)) // And further to: // (sub (xor X, M), M) - if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && + DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { auto IsNegV = [](SDNode *N, SDValue V) { return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); @@ -31642,9 +31680,6 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, V = Y; if (V) { - if (EltBits != 8 && EltBits != 16 && EltBits != 32) - return SDValue(); - SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); SDValue SubOp2 = Mask; @@ -31661,8 +31696,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, if (V == Y) std::swap(SubOp1, SubOp2); - return DAG.getBitcast(VT, - DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2)); + SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); + return DAG.getBitcast(VT, Res); } } @@ -31675,7 +31710,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); Mask = DAG.getBitcast(BlendVT, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); return DAG.getBitcast(VT, Mask); } @@ -33655,8 +33690,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands // are NaN, the NaN value of Op1 is the result. - auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; - return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); + return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax); } /// Do target-specific dag combines on X86ISD::ANDNP nodes. @@ -33949,7 +33983,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); - return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); + return DAG.getSelect(DL, VT, N0, AllOnes, Zero); } return SDValue(); } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 3dc673e3c35a..d003d027ddb9 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -43,7 +43,8 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. 
let Defs = [ESP, EFLAGS], Uses = [ESP] in { -def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), + (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3), "#ADJCALLSTACKDOWN", []>, Requires<[NotLP64]>; @@ -52,8 +53,8 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), [(X86callseq_end timm:$amt1, timm:$amt2)]>, Requires<[NotLP64]>; } -def : Pat<(X86callseq_start timm:$amt1), - (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>; +def : Pat<(X86callseq_start timm:$amt1, timm:$amt2), + (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>; // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into @@ -62,7 +63,8 @@ def : Pat<(X86callseq_start timm:$amt1), // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. let Defs = [RSP, EFLAGS], Uses = [RSP] in { -def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), + (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3), "#ADJCALLSTACKDOWN", []>, Requires<[IsLP64]>; @@ -71,8 +73,8 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), [(X86callseq_end timm:$amt1, timm:$amt2)]>, Requires<[IsLP64]>; } -def : Pat<(X86callseq_start timm:$amt1), - (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>; +def : Pat<(X86callseq_start timm:$amt1, timm:$amt2), + (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>; // x86-64 va_start lowering magic. diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 888daa275265..092ceb207ada 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -5729,6 +5729,44 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) { } } +std::pair<X86::CondCode, bool> +X86::getX86ConditionCode(CmpInst::Predicate Predicate) { + X86::CondCode CC = X86::COND_INVALID; + bool NeedSwap = false; + switch (Predicate) { + default: break; + // Floating-point Predicates + case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; + case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH; + case CmpInst::FCMP_OGT: CC = X86::COND_A; break; + case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH; + case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; + case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH; + case CmpInst::FCMP_ULT: CC = X86::COND_B; break; + case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH; + case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; + case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; + case CmpInst::FCMP_UNO: CC = X86::COND_P; break; + case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; + case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH; + case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; + + // Integer Predicates + case CmpInst::ICMP_EQ: CC = X86::COND_E; break; + case CmpInst::ICMP_NE: CC = X86::COND_NE; break; + case CmpInst::ICMP_UGT: CC = X86::COND_A; break; + case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; + case CmpInst::ICMP_ULT: CC = X86::COND_B; break; + case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; + case CmpInst::ICMP_SGT: CC = X86::COND_G; break; + case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; + case CmpInst::ICMP_SLT: CC = X86::COND_L; break; + case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; + } + + return std::make_pair(CC, NeedSwap); +} + /// Return a set opcode for the given condition and /// whether it has memory 
operand. unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { @@ -7589,6 +7627,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + case X86::AVX1_SETALLONES: { + unsigned Reg = MIB->getOperand(0).getReg(); + // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS. + MIB->setDesc(get(X86::VCMPPSYrri)); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); + return true; + } case X86::AVX512_512_SETALLONES: { unsigned Reg = MIB->getOperand(0).getReg(); MIB->setDesc(get(X86::VPTERNLOGDZrri)); @@ -8477,6 +8522,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Alignment = 64; break; case X86::AVX2_SETALLONES: + case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_256_SET0: Alignment = 32; @@ -8522,6 +8568,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX2_SETALLONES: + case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: @@ -8563,13 +8610,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || - Opc == X86::AVX512_256_SET0) + Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES || - Opc == X86::AVX512_512_SETALLONES); + Opc == X86::AVX512_512_SETALLONES || + Opc == X86::AVX1_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 38567831b3a4..e64876073ccf 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -64,6 +64,10 @@ enum CondCode { // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); +/// \brief Return a pair of condition code for the given predicate and whether +/// the instruction operands should be swaped to match the condition code. +std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate); + /// \brief Return a set opcode for the given condition and whether it has /// a memory operand. unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); @@ -186,6 +190,8 @@ public: /// setup..destroy sequence (e.g. by pushes, or inside the callee). int64_t getFrameAdjustment(const MachineInstr &I) const { assert(isFrameInstr(I)); + if (isFrameSetup(I)) + return I.getOperand(2).getImm(); return I.getOperand(1).getImm(); } @@ -193,7 +199,10 @@ public: /// instruction. 
void setFrameAdjustment(MachineInstr &I, int64_t V) const { assert(isFrameInstr(I)); - I.getOperand(1).setImm(V); + if (isFrameSetup(I)) + I.getOperand(2).setImm(V); + else + I.getOperand(1).setImm(V); } /// getSPAdjust - This returns the stack pointer adjustment made by diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 902b0c2c04e3..4d7d8ece92d9 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -84,7 +84,8 @@ def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; -def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; @@ -2351,6 +2352,38 @@ let Predicates = [HasBMI2] in { def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)), (BZHI64rm addr:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + // x & (-1 >> (32 - y)) + def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x & (-1 >> (64 - y)) + def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + + // x << (32 - y) >> (32 - y) + def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x << (64 - y) >> (64 - y) + def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; } // HasBMI2 let Predicates = [HasBMI] in { diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 48da2fa607af..f73d85e7e01b 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -486,6 +486,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; + let Predicates = [HasAVX1Only, OptForMinSize] in { + def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; + } let Predicates = [HasAVX2] in def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8i32 immAllOnesV))]>; @@ -7755,14 +7759,12 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } - -// Without AVX2 we need to concat two v4i32 V_SETALLONES to create a 256-bit -// all ones value. 
-let Predicates = [HasAVX1Only] in -def : Pat<(v8i32 immAllOnesV), - (VINSERTF128rr - (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm), - (V_SETALLONES), 1)>; +// To create a 256-bit all ones value, we should produce VCMPTRUEPS +// with YMM register containing zero. +// FIXME: Avoid producing vxorps to clear the fake inputs. +let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>; +} multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, PatFrag memop_frag> { diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index d65eb1de8d09..de58d719acb4 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -56,13 +56,9 @@ private: bool selectImpl(MachineInstr &I) const; // TODO: remove after suported by Tablegen-erated instruction selection. - unsigned getFAddOp(LLT &Ty, const RegisterBank &RB) const; - unsigned getFSubOp(LLT &Ty, const RegisterBank &RB) const; unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc, uint64_t Alignment) const; - bool selectBinaryOp(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF) const; bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI, @@ -71,6 +67,10 @@ private: MachineFunction &MF) const; bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; const X86TargetMachine &TM; const X86Subtarget &STI; @@ -226,13 +226,11 @@ bool X86InstructionSelector::select(MachineInstr &I) const { "Generic instruction has unexpected implicit operands\n"); if (selectImpl(I)) - return true; + return true; DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs())); // TODO: This should be implemented by tblgen. 
- if (selectBinaryOp(I, MRI, MF)) - return true; if (selectLoadStoreOp(I, MRI, MF)) return true; if (selectFrameIndexOrGep(I, MRI, MF)) @@ -241,109 +239,14 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectTrunc(I, MRI, MF)) return true; + if (selectZext(I, MRI, MF)) + return true; + if (selectCmp(I, MRI, MF)) + return true; return false; } -unsigned X86InstructionSelector::getFAddOp(LLT &Ty, - const RegisterBank &RB) const { - - if (X86::VECRRegBankID != RB.getID()) - return TargetOpcode::G_FADD; - - if (Ty == LLT::scalar(32)) { - if (STI.hasAVX512()) { - return X86::VADDSSZrr; - } else if (STI.hasAVX()) { - return X86::VADDSSrr; - } else if (STI.hasSSE1()) { - return X86::ADDSSrr; - } - } else if (Ty == LLT::scalar(64)) { - if (STI.hasAVX512()) { - return X86::VADDSDZrr; - } else if (STI.hasAVX()) { - return X86::VADDSDrr; - } else if (STI.hasSSE2()) { - return X86::ADDSDrr; - } - } else if (Ty == LLT::vector(4, 32)) { - if ((STI.hasAVX512()) && (STI.hasVLX())) { - return X86::VADDPSZ128rr; - } else if (STI.hasAVX()) { - return X86::VADDPSrr; - } else if (STI.hasSSE1()) { - return X86::ADDPSrr; - } - } - - return TargetOpcode::G_FADD; -} - -unsigned X86InstructionSelector::getFSubOp(LLT &Ty, - const RegisterBank &RB) const { - - if (X86::VECRRegBankID != RB.getID()) - return TargetOpcode::G_FSUB; - - if (Ty == LLT::scalar(32)) { - if (STI.hasAVX512()) { - return X86::VSUBSSZrr; - } else if (STI.hasAVX()) { - return X86::VSUBSSrr; - } else if (STI.hasSSE1()) { - return X86::SUBSSrr; - } - } else if (Ty == LLT::scalar(64)) { - if (STI.hasAVX512()) { - return X86::VSUBSDZrr; - } else if (STI.hasAVX()) { - return X86::VSUBSDrr; - } else if (STI.hasSSE2()) { - return X86::SUBSDrr; - } - } else if (Ty == LLT::vector(4, 32)) { - if ((STI.hasAVX512()) && (STI.hasVLX())) { - return X86::VSUBPSZ128rr; - } else if (STI.hasAVX()) { - return X86::VSUBPSrr; - } else if (STI.hasSSE1()) { - return X86::SUBPSrr; - } - } - - return TargetOpcode::G_FSUB; -} - -bool X86InstructionSelector::selectBinaryOp(MachineInstr &I, - MachineRegisterInfo &MRI, - MachineFunction &MF) const { - - const unsigned DefReg = I.getOperand(0).getReg(); - LLT Ty = MRI.getType(DefReg); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - unsigned NewOpc = I.getOpcode(); - - switch (NewOpc) { - case TargetOpcode::G_FADD: - NewOpc = getFAddOp(Ty, RB); - break; - case TargetOpcode::G_FSUB: - NewOpc = getFSubOp(Ty, RB); - break; - default: - break; - } - - if (NewOpc == I.getOpcode()) - return false; - - I.setDesc(TII.get(NewOpc)); - - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); -} - unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc, uint64_t Alignment) const { @@ -562,6 +465,105 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I, return true; } +bool X86InstructionSelector::selectZext(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_ZEXT) + return false; + + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + + if (SrcTy == LLT::scalar(1)) { + + unsigned AndOpc; + if (DstTy == LLT::scalar(32)) + AndOpc = X86::AND32ri8; + else if (DstTy == LLT::scalar(64)) + AndOpc = X86::AND64ri8; + else + return false; + + const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); + unsigned DefReg = + 
MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank)); + + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::SUBREG_TO_REG), DefReg) + .addImm(0) + .addReg(SrcReg) + .addImm(X86::sub_8bit); + + MachineInstr &AndInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg) + .addReg(DefReg) + .addImm(1); + + constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI); + + I.eraseFromParent(); + return true; + } + + return false; +} + +bool X86InstructionSelector::selectCmp(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_ICMP) + return false; + + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = X86::getX86ConditionCode( + (CmpInst::Predicate)I.getOperand(1).getPredicate()); + unsigned OpSet = X86::getSETFromCond(CC); + + unsigned LHS = I.getOperand(2).getReg(); + unsigned RHS = I.getOperand(3).getReg(); + + if (SwapArgs) + std::swap(LHS, RHS); + + unsigned OpCmp; + LLT Ty = MRI.getType(LHS); + + switch (Ty.getSizeInBits()) { + default: + return false; + case 8: + OpCmp = X86::CMP8rr; + break; + case 16: + OpCmp = X86::CMP16rr; + break; + case 32: + OpCmp = X86::CMP32rr; + break; + case 64: + OpCmp = X86::CMP64rr; + break; + } + + MachineInstr &CmpInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp)) + .addReg(LHS) + .addReg(RHS); + + MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(OpSet), I.getOperand(0).getReg()); + + constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI); + constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI); + + I.eraseFromParent(); + return true; +} + InstructionSelector * llvm::createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &Subtarget, diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 2a40399ba571..bc73bb1ae8c5 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t { TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, - FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, + FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, }; struct IntrinsicData { diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 4f5e70414aa9..cf26238c0239 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -87,10 +87,16 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({G_ZEXT, s32}, Legal); setAction({G_SEXT, s32}, Legal); - for (auto Ty : {s8, s16}) { + for (auto Ty : {s1, s8, s16}) { setAction({G_ZEXT, 1, Ty}, Legal); setAction({G_SEXT, 1, Ty}, Legal); } + + // Comparison + setAction({G_ICMP, s1}, Legal); + + for (auto Ty : {s8, s16, s32, p0}) + setAction({G_ICMP, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfo64bit() { @@ -139,10 +145,16 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { setAction({G_SEXT, Ty}, Legal); } - for (auto Ty : {s8, s16, s32}) { + for (auto Ty : {s1, s8, s16, s32}) { setAction({G_ZEXT, 1, Ty}, Legal); setAction({G_SEXT, 1, Ty}, Legal); } + + // Comparison + setAction({G_ICMP, s1}, Legal); + + for (auto Ty : {s8, s16, s32, s64, p0}) + setAction({G_ICMP, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfoSSE1() { diff --git a/lib/Target/X86/X86RegisterInfo.cpp 
b/lib/Target/X86/X86RegisterInfo.cpp index cf2ceef8013a..7e4cba1c8345 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -320,14 +320,14 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { case CallingConv::X86_RegCall: if (Is64Bit) { if (IsWin64) { - return (HasSSE ? CSR_Win64_RegCall_SaveList : + return (HasSSE ? CSR_Win64_RegCall_SaveList : CSR_Win64_RegCall_NoSSE_SaveList); } else { - return (HasSSE ? CSR_SysV64_RegCall_SaveList : + return (HasSSE ? CSR_SysV64_RegCall_SaveList : CSR_SysV64_RegCall_NoSSE_SaveList); } } else { - return (HasSSE ? CSR_32_RegCall_SaveList : + return (HasSSE ? CSR_32_RegCall_SaveList : CSR_32_RegCall_NoSSE_SaveList); } case CallingConv::Cold: @@ -435,15 +435,15 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_64_HHVM_RegMask; case CallingConv::X86_RegCall: if (Is64Bit) { - if (IsWin64) { - return (HasSSE ? CSR_Win64_RegCall_RegMask : + if (IsWin64) { + return (HasSSE ? CSR_Win64_RegCall_RegMask : CSR_Win64_RegCall_NoSSE_RegMask); } else { - return (HasSSE ? CSR_SysV64_RegCall_RegMask : + return (HasSSE ? CSR_SysV64_RegCall_RegMask : CSR_SysV64_RegCall_NoSSE_RegMask); } } else { - return (HasSSE ? CSR_32_RegCall_RegMask : + return (HasSSE ? CSR_32_RegCall_RegMask : CSR_32_RegCall_NoSSE_RegMask); } case CallingConv::Cold: diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index de1514243aeb..02be95e2e556 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -253,6 +253,11 @@ protected: /// True if the LEA instruction with certain arguments is slow bool SlowLEA; + /// True if the LEA instruction has all three source operands: base, index, + /// and offset or if the LEA instruction uses base and index registers where + /// the base is EBP, RBP,or R13 + bool Slow3OpsLEA; + /// True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; @@ -490,6 +495,7 @@ public: bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } + bool slow3OpsLEA() const { return Slow3OpsLEA; } bool slowIncDec() const { return SlowIncDec; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 086f55dd60b5..c6a90725d89c 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); +void initializeFixupLEAPassPass(PassRegistry &); void initializeX86ExecutionDepsFixPass(PassRegistry &); } // end namespace llvm @@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() { initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); initializeEvexToVexInstPassPass(PR); + initializeFixupLEAPassPass(PR); initializeX86ExecutionDepsFixPass(PR); } @@ -87,7 +89,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSFreeBSD()) return llvm::make_unique<X86FreeBSDTargetObjectFile>(); - if (TT.isOSLinux() || TT.isOSNaCl()) + if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU()) return llvm::make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSFuchsia()) return llvm::make_unique<X86FuchsiaTargetObjectFile>(); diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp 
index f3b619a2956a..80e18161a94b 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -247,35 +247,38 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry SSE2UniformConstCostTable[] = { - { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. - { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. - { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. - - { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand). - { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand). - { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb). - - { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence - { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence - { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence - { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence - { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence - { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence - { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence - { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + + { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. + { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. + { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. + + { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasSSE2()) { // pmuldq sequence. if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) - return LT.first * 30; + return LT.first * 32; if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) return LT.first * 15; - if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; + // XOP has faster vXi8 shifts. + if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) || + !ST->hasXOP()) + if (const auto *Entry = + CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; } static const CostTblEntry AVX2UniformCostTable[] = { @@ -430,18 +433,18 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v2i64, 2 }, { ISD::SRA, MVT::v2i64, 2 }, // 256bit shifts require splitting if AVX2 didn't catch them above. - { ISD::SHL, MVT::v32i8, 2 }, - { ISD::SRL, MVT::v32i8, 4 }, - { ISD::SRA, MVT::v32i8, 4 }, - { ISD::SHL, MVT::v16i16, 2 }, - { ISD::SRL, MVT::v16i16, 4 }, - { ISD::SRA, MVT::v16i16, 4 }, - { ISD::SHL, MVT::v8i32, 2 }, - { ISD::SRL, MVT::v8i32, 4 }, - { ISD::SRA, MVT::v8i32, 4 }, - { ISD::SHL, MVT::v4i64, 2 }, - { ISD::SRL, MVT::v4i64, 4 }, - { ISD::SRA, MVT::v4i64, 4 }, + { ISD::SHL, MVT::v32i8, 2+2 }, + { ISD::SRL, MVT::v32i8, 4+2 }, + { ISD::SRA, MVT::v32i8, 4+2 }, + { ISD::SHL, MVT::v16i16, 2+2 }, + { ISD::SRL, MVT::v16i16, 4+2 }, + { ISD::SRA, MVT::v16i16, 4+2 }, + { ISD::SHL, MVT::v8i32, 2+2 }, + { ISD::SRL, MVT::v8i32, 4+2 }, + { ISD::SRA, MVT::v8i32, 4+2 }, + { ISD::SHL, MVT::v4i64, 2+2 }, + { ISD::SRL, MVT::v4i64, 4+2 }, + { ISD::SRA, MVT::v4i64, 4+2 }, }; // Look for XOP lowering tricks. 
@@ -451,23 +454,28 @@ int X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry SSE2UniformShiftCostTable[] = { // Uniform splats are cheaper for the following instructions. - { ISD::SHL, MVT::v16i16, 2 }, // psllw. - { ISD::SHL, MVT::v8i32, 2 }, // pslld - { ISD::SHL, MVT::v4i64, 2 }, // psllq. - - { ISD::SRL, MVT::v16i16, 2 }, // psrlw. - { ISD::SRL, MVT::v8i32, 2 }, // psrld. - { ISD::SRL, MVT::v4i64, 2 }, // psrlq. - - { ISD::SRA, MVT::v16i16, 2 }, // psraw. - { ISD::SRA, MVT::v8i32, 2 }, // psrad. - { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. - { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. + { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split. + { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split. + { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split. + + { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split. + { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split. + { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split. + + { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split. + { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split. + { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle. + { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split. }; if (ST->hasSSE2() && ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || (Op2Info == TargetTransformInfo::OK_UniformValue))) { + + // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table. + if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2()) + return LT.first * 4; // 2*psrad + shuffle. + if (const auto *Entry = CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; @@ -581,28 +589,28 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; static const CostTblEntry SSE41CostTable[] = { - { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. - { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence. - { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld - { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld - - { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. - { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence. - { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. - { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend. - - { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. - { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence. - { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. - { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend. - - { ISD::MUL, MVT::v4i32, 1 } // pmulld + { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. + { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split. + { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld + { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split + + { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. + { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split. + { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. 
+ { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split. + + { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. + { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split. + { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. + { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split. + + { ISD::MUL, MVT::v4i32, 1 } // pmulld }; if (ST->hasSSE41()) @@ -612,33 +620,33 @@ int X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. - { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. - { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. - { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. - - { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. - { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. - - { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. - { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. - { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. - - { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v8i16, 1 }, // pmullw - { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle - { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add - - { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ + { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. + + { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. + + { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split. + + { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v8i16, 1 }, // pmullw + { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle + { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add + + { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ // It is not a good idea to vectorize division. 
We have to scalarize it and // in the process we will often end up having to spilling regular diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index 500b26b3be17..3ee14a0ff7b1 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -398,7 +398,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { /*isVarArg=*/false); Function *Trampoline = Function::Create(TrampolineTy, GlobalValue::InternalLinkage, - Twine("__ehhandler$") + GlobalValue::getRealLinkageName( + Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape( ParentFunc->getName()), TheModule); BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index b8742683a0c8..1da189c5cd31 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -409,7 +409,7 @@ static bool isWordAligned(SDValue Value, SelectionDAG &DAG) { KnownBits Known; DAG.computeKnownBits(Value, Known); - return Known.Zero.countTrailingOnes() >= 2; + return Known.countMinTrailingZeros() >= 2; } SDValue XCoreTargetLowering:: @@ -1131,8 +1131,7 @@ SDValue XCoreTargetLowering::LowerCCCCallTo( unsigned NumBytes = RetCCInfo.getNextStackOffset(); auto PtrVT = getPointerTy(DAG.getDataLayout()); - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getConstant(NumBytes, dl, PtrVT, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass; SmallVector<SDValue, 12> MemOpChains; diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index f1d52d5a191f..b87ba6548962 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -73,9 +73,10 @@ def XCoreLdwsp : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp, [SDNPHasChain, SDNPMayLoad]>; // These are target-independent nodes, but have target-specific formats. -def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_XCoreCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, - SDTCisVT<1, i32> ]>; + SDTCisVT<1, i32> ]>; def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; @@ -323,9 +324,9 @@ class F2R_np<bits<6> opc, string OpcStr> : //===----------------------------------------------------------------------===// let Defs = [SP], Uses = [SP] in { -def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt), - "# ADJCALLSTACKDOWN $amt", - [(callseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt, i32imm:$amt2), + "# ADJCALLSTACKDOWN $amt, $amt2", + [(callseq_start timm:$amt, timm:$amt2)]>; def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2), "# ADJCALLSTACKUP $amt1", [(callseq_end timm:$amt1, timm:$amt2)]>; |
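
A recurring change across the targets in this patch is that ISD::CALLSEQ_START now carries two immediate operands instead of one (the second is passed as 0 at every call site shown here), so each target's ADJCALLSTACKDOWN pseudo grows an extra operand and SelectionDAG::getCALLSEQ_START takes the byte counts directly. The sketch below is a minimal, illustrative C++ fragment of how a target's call lowering would open the call frame with the updated API, matching the XCore and X86 call sites in this diff; the helper name beginCallSequence is hypothetical and not part of the patch.

    // Hypothetical helper used from a target's LowerCall() implementation.
    // It wraps the new two-immediate form of CALLSEQ_START seen in this patch.
    #include "llvm/CodeGen/SelectionDAG.h"

    using namespace llvm;

    static SDValue beginCallSequence(SelectionDAG &DAG, SDValue Chain,
                                     uint64_t NumBytes, const SDLoc &DL) {
      // First immediate: bytes to reserve for outgoing arguments.
      // Second immediate: the new extra adjustment operand, 0 here as at
      // every call site updated in this patch.
      return DAG.getCALLSEQ_START(Chain, NumBytes, /*OutSize=*/0, DL);
    }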