diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
commit | 044eb2f6afba375a914ac9d8024f8f5142bb912e (patch) | |
tree | 1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/Target/PowerPC/P9InstrResources.td | |
parent | eb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b (diff) | |
download | src-test2-044eb2f6afba375a914ac9d8024f8f5142bb912e.tar.gz src-test2-044eb2f6afba375a914ac9d8024f8f5142bb912e.zip |
Notes
Diffstat (limited to 'lib/Target/PowerPC/P9InstrResources.td')
-rw-r--r-- | lib/Target/PowerPC/P9InstrResources.td | 687 |
1 files changed, 435 insertions, 252 deletions
diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td index aea022f88766..dc6ed16e53ce 100644 --- a/lib/Target/PowerPC/P9InstrResources.td +++ b/lib/Target/PowerPC/P9InstrResources.td @@ -12,11 +12,29 @@ // is listed here. Instructions in this file belong to itinerary classes that // have instructions with different resource requirements. // +// The makeup of the P9 CPU is modeled as follows: +// - Each CPU is made up of two superslices. +// - Each superslice is made up of two slices. Therefore, there are 4 slices +// for each CPU. +// - Up to 6 instructions can be dispatched to each CPU. Three per superslice. +// - Each CPU has: +// - One CY (Crypto) unit P9_CY_* +// - One DFU (Decimal Floating Point and Quad Precision) unit P9_DFU_* +// - Two PM (Permute) units. One on each superslice. P9_PM_* +// - Two DIV (Fixed Point Divide) units. One on each superslice. P9_DIV_* +// - Four ALU (Fixed Point Arithmetic) units. One on each slice. P9_ALU_* +// - Four DP (Floating Point) units. One on each slice. P9_DP_* +// This also includes fixed point multiply add. +// - Four AGEN (Address Generation) units. One for each slice. P9_AGEN_* +// - Four Load/Store Queues. P9_LS_* +// - Each set of instructions will require a number of these resources. //===----------------------------------------------------------------------===// - +// Two cycle ALU vector operation that uses an entire superslice. +// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. 
def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C], + DISP_1C, DISP_1C, DISP_1C], (instrs VADDCUW, VADDUBM, @@ -26,47 +44,41 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, VAND, VANDC, VCMPEQUB, - VCMPEQUBo, VCMPEQUD, - VCMPEQUDo, VCMPEQUH, - VCMPEQUHo, VCMPEQUW, - VCMPEQUWo, - VCMPGTSB, - VCMPGTSBo, - VCMPGTSD, - VCMPGTSDo, - VCMPGTSH, - VCMPGTSHo, - VCMPGTSW, - VCMPGTSWo, - VCMPGTUB, - VCMPGTUBo, - VCMPGTUD, - VCMPGTUDo, - VCMPGTUH, - VCMPGTUHo, - VCMPGTUW, - VCMPGTUWo, VCMPNEB, - VCMPNEBo, VCMPNEH, - VCMPNEHo, VCMPNEW, - VCMPNEWo, VCMPNEZB, - VCMPNEZBo, VCMPNEZH, - VCMPNEZHo, VCMPNEZW, - VCMPNEZWo, VEQV, VEXTSB2D, VEXTSB2W, VEXTSH2D, VEXTSH2W, VEXTSW2D, + VRLB, + VRLD, + VRLDMI, + VRLDNM, + VRLH, + VRLW, + VRLWMI, + VRLWNM, + VSRAB, + VSRAD, + VSRAH, + VSRAW, + VSRB, + VSRD, + VSRH, + VSRW, + VSLB, + VSLD, + VSLH, + VSLW, VMRGEW, VMRGOW, VNAND, @@ -77,9 +89,7 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, VORC, VPOPCNTB, VPOPCNTH, - VPOPCNTW, VSEL, - VSUBCUW, VSUBUBM, VSUBUDM, VSUBUHM, @@ -98,6 +108,8 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, XVNEGDP, XVNEGSP, XVXEXPDP, + XVIEXPSP, + XVXEXPSP, XXLAND, XXLANDC, XXLEQV, @@ -107,28 +119,128 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, XXLORf, XXLORC, XXLXOR, - XXSEL -)>; - -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs + XXSEL, XSABSQP, XSCPSGNQP, XSIEXPQP, XSNABSQP, XSNEGQP, - XSXEXPQP, - XSABSDP, - XSCPSGNDP, - XSIEXPDP, + XSXEXPQP +)>; + +// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a +// single slice. However, since it is Restricted it requires all 3 dispatches +// (DISP) for that superslice. +def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + FCMPUS, + FCMPUD, + XSTSTDCDP, + XSTSTDCSP +)>; + +// Standard Dispatch ALU operation for 3 cycles. Only one slice used. 
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], + (instrs + XSMAXCDP, + XSMAXDP, + XSMAXJDP, + XSMINCDP, + XSMINDP, + XSMINJDP, + XSTDIVDP, + XSTSQRTDP, + XSCMPEQDP, + XSCMPEXPDP, + XSCMPGEDP, + XSCMPGTDP, + XSCMPODP, + XSCMPUDP, + XSXSIGDP, + XSCVSPDPN +)>; + +// Standard Dispatch ALU operation for 2 cycles. Only one slice used. +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], + (instrs + ADDIStocHA, + ADDItocL, + MCRF, + MCRXRX, + SLD, + SRD, + SRAD, + SRADI, + RLDIC, XSNABSDP, + XSXEXPDP, + XSABSDP, XSNEGDP, - XSXEXPDP + XSCPSGNDP )>; -def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a +// single slice. However, since it is Restricted it requires all 3 dispatches +// (DISP) for that superslice. +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + RLDCL, + RLDCR, + RLDIMI, + RLDICL, + RLDICR, + RLDICL_32_64, + XSIEXPDP, + FMR, + FABSD, + FABSS, + FNABSD, + FNABSS, + FNEGD, + FNEGS, + FCPSGND, + FCPSGNS +)>; +// Three cycle ALU vector operation that uses an entire superslice. +// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. 
+def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], + (instrs + VBPERMD, + VABSDUB, + VABSDUH, + VABSDUW, + VADDUBS, + VADDUHS, + VADDUWS, + VAVGSB, + VAVGSH, + VAVGSW, + VAVGUB, + VAVGUH, + VAVGUW, + VCMPEQFP, + VCMPEQFPo, + VCMPGEFP, + VCMPGEFPo, + VCMPBFP, + VCMPBFPo, + VCMPGTFP, + VCMPGTFPo, + VCLZB, + VCLZD, + VCLZH, + VCLZW, + VCTZB, + VCTZD, + VCTZH, + VCTZW, + VADDSBS, + VADDSHS, + VADDSWS, + VMINFP, VMINSB, VMINSD, VMINSH, @@ -137,55 +249,54 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C VMINUD, VMINUH, VMINUW, + VMAXFP, + VMAXSB, + VMAXSD, + VMAXSH, + VMAXSW, + VMAXUB, + VMAXUD, + VMAXUH, + VMAXUW, + VPOPCNTW, VPOPCNTD, VPRTYBD, VPRTYBW, - VRLB, - VRLD, - VRLDMI, - VRLDNM, - VRLH, - VRLW, - VRLWMI, - VRLWNM, VSHASIGMAD, VSHASIGMAW, - VSLB, - VSLD, - VSLH, - VSLW, - VSRAB, - VSRAD, - VSRAH, - VSRAW, - VSRB, - VSRD, - VSRH, - VSRW, VSUBSBS, VSUBSHS, VSUBSWS, VSUBUBS, VSUBUHS, VSUBUWS, - XSCMPEQDP, - XSCMPEXPDP, - XSCMPGEDP, - XSCMPGTDP, - XSCMPODP, - XSCMPUDP, - XSCVSPDPN, - XSMAXCDP, - XSMAXDP, - XSMAXJDP, - XSMINCDP, - XSMINDP, - XSMINJDP, - XSTDIVDP, - XSTSQRTDP, - XSTSTDCDP, - XSTSTDCSP, - XSXSIGDP, + VSUBCUW, + VCMPGTSB, + VCMPGTSBo, + VCMPGTSD, + VCMPGTSDo, + VCMPGTSH, + VCMPGTSHo, + VCMPGTSW, + VCMPGTSWo, + VCMPGTUB, + VCMPGTUBo, + VCMPGTUD, + VCMPGTUDo, + VCMPGTUH, + VCMPGTUHo, + VCMPGTUW, + VCMPGTUWo, + VCMPNEBo, + VCMPNEHo, + VCMPNEWo, + VCMPNEZBo, + VCMPNEZHo, + VCMPNEZWo, + VCMPEQUBo, + VCMPEQUDo, + VCMPEQUHo, + VCMPEQUWo, XVCMPEQDP, XVCMPEQDPo, XVCMPEQSP, @@ -198,7 +309,6 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C XVCMPGTDPo, XVCMPGTSP, XVCMPGTSPo, - XVIEXPSP, XVMAXDP, XVMAXSP, XVMINDP, @@ -209,58 +319,15 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C XVTSQRTSP, XVTSTDCDP, XVTSTDCSP, - XVXEXPSP, XVXSIGDP, XVXSIGSP )>; -def : InstRW<[P9_ALUE_4C, P9_ALUO_4C, IP_EXECE_1C, IP_EXECO_1C, 
DISP_1C, DISP_1C], - (instrs - VABSDUB, - VABSDUH, - VABSDUW, - VADDSBS, - VADDSHS, - VADDSWS, - VADDUBS, - VADDUHS, - VADDUWS, - VAVGSB, - VAVGSH, - VAVGSW, - VAVGUB, - VAVGUH, - VAVGUW, - VBPERMD, - VCLZB, - VCLZD, - VCLZH, - VCLZW, - VCMPBFP, - VCMPBFPo, - VCMPGTFP, - VCMPGTFPo, - VCTZB, - VCTZD, - VCTZH, - VCTZW, - VMAXFP, - VMAXSB, - VMAXSD, - VMAXSH, - VMAXSW, - VMAXUB, - VMAXUD, - VMAXUH, - VMAXUW, - VMINFP, - VCMPEQFP, - VCMPEQFPo, - VCMPGEFP, - VCMPGEFPo -)>; - -def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 7 cycle DP vector operation that uses an entire superslice. +// Uses both DP units (the even DPE and odd DPO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. +def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs VADDFP, VCTSXS, @@ -367,8 +434,47 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], VSUMSWS )>; +// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three +// dispatch units for the superslice. def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + FRSP, + FRIND, + FRINS, + FRIPD, + FRIPS, + FRIZD, + FRIZS, + FRIMD, + FRIMS, + FRE, + FRES, + FRSQRTE, + FRSQRTES, + FMADDS, + FMADD, + FMSUBS, + FMSUB, + FNMADDS, + FNMADD, + FNMSUBS, + FNMSUB, + FSELD, + FSELS, + FADDS, + FMULS, + FMUL, + FSUBS, + FCFID, + FCTID, + FCTIDZ, + FCFIDU, + FCFIDS, + FCFIDUS, + FCTIDUZ, + FCTIWUZ, + FCTIW, + FCTIWZ, XSMADDADP, XSMADDASP, XSMADDMDP, @@ -389,7 +495,19 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], XSNMSUBMSP )>; +// 7 cycle Restricted DP operation and one 2 cycle ALU operation. +// The DP is restricted so we need a full 5 dispatches. +def : InstRW<[P9_DPOpAndALUOp_9C, IP_EXEC_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + FMULo, + FMADDo, + FMSUBo, + FNMADDo, + FNMSUBo +)>; +// 7 cycle DP operation. 
One DP unit, one EXEC pipeline and two dispatch units. def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs XSADDDP, @@ -397,8 +515,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], XSCVDPHP, XSCVDPSP, XSCVDPSXDS, + XSCVDPSXDSs, XSCVDPSXWS, XSCVDPUXDS, + XSCVDPUXDSs, XSCVDPUXWS, XSCVHPDP, XSCVSPDP, @@ -421,7 +541,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], XSCVDPSPN )>; -def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], +// Three Cycle PM operation. Only one PM unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], (instrs VBPERMQ, VCLZLSBB, @@ -469,7 +592,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], VSLO, VSLV, VSPLTB, + VSPLTBs, VSPLTH, + VSPLTHs, VSPLTISB, VSPLTISH, VSPLTISW, @@ -498,6 +623,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], XXSLDWI, XXSPLTIB, XXSPLTW, + XXSPLTWs, + XXPERMDI, + XXPERMDIs, VADDCUQ, VADDECUQ, VADDEUQM, @@ -517,7 +645,10 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], XSXSIGQP )>; -def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSADDQP, XSADDQPO, @@ -536,7 +667,10 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], XSSUBQPO )>; -def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. 
+def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSMADDQP, XSMADDQPO, @@ -550,45 +684,57 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], XSNMSUBQPO )>; -def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSDIVQP, XSDIVQPO )>; -def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSSQRTQP, XSSQRTQPO )>; -// Load Operation in IIC_LdStLFD - +// 5 Cycle load uses a single slice. def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C], (instrs LXSDX, LXVD2X, LXSIWZX, LXV, - LXSD + LXVX, + LXSD, + DFLOADf64, + XFLOADf64 )>; -def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], +// 4 Cycle load uses a single slice. +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C], (instrs - LFIWZX, - LFDX, - LFD + COPY )>; -def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// 4 Cycle Restricted load uses a single slice but the dispatch for the whole +// superslice. +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - LXSSPX, - LXSIWAX, - LXSSP + LFIWZX, + LFDX, + LFD )>; -def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, +// Cracked Restricted Load instruction. +// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU +// operations cannot be done at the same time and so their latencies are added. +// Full 6 dispatches are required as this is both cracked and restricted. 
+def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LFIWAX, @@ -596,14 +742,38 @@ def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, LFS )>; -def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C], +// Cracked Load instruction. +// Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU +// operations cannot be done at the same time and so their latencies are added. +// Full 4 dispatches are required as this is a cracked instruction. +def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + LXSSPX, + LXSIWAX, + LXSSP, + DFLOADf32, + XFLOADf32, + LIWAX, + LIWZX +)>; + +// Cracked Load that requires the PM resource. +// Since the Load and the PM cannot be done at the same time the latencies are +// added. Requires 8 cycles. +// Since the PM requires the full superslice we need both EXECE, EXECO pipelines +// as well as 3 dispatches for the PM. The Load requires the remaining 2 +// dispatches. +def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LXVDSX, + LXVWSX, LXVW4X )>; -// Store Operations in IIC_LdStSTFD. - +// Single slice Restricted store operation. The restricted operation requires +// all three dispatches for the superslice. def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], (instrs STFS, @@ -613,74 +783,88 @@ def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], STFDX, STXSDX, STXSSPX, - STXSIWX + STXSIWX, + DFSTOREf32, + DFSTOREf64, + XFSTOREf32, + XFSTOREf64, + STIWX )>; -def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C], +// Store operation that requires the whole superslice. 
+def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs STXVD2X, STXVW4X )>; -// Divide Operations in IIC_IntDivW, IIC_IntDivD. - -def : InstRW<[P9_DIV_16C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVW, - DIVWU + DIVWU, + MODSW )>; -def : InstRW<[P9_DIV_24C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVWE, DIVD, DIVWEU, - DIVDU + DIVDU, + MODSD, + MODUD, + MODUW )>; -def : InstRW<[P9_DIV_40C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVDE, DIVDEU )>; -def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// Cracked DIV and ALU operation. Requires one full slice for the ALU operation +// and one full superslice for the DIV operation since there is only one DIV +// per superslice. Latency of DIV plus ALU is 26. +def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + DIVDo, + DIVDUo, DIVWEo, DIVWEUo )>; -def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// Cracked DIV and ALU operation. 
Requires one full slice for the ALU operation +// and one full superslice for the DIV operation since there is only one DIV +// per superslice. Latency of DIV plus ALU is 42. +def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs DIVDEo, DIVDEUo )>; -// Rotate Operations in IIC_IntRotateD, IIC_IntRotateDI -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs - SLD, - SRD, - SRAD, - SRADI, - RLDIC -)>; - -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], - (instrs - RLDCL, - RLDCR, - RLDIMI, - RLDICL, - RLDICR, - RLDICL_32_64 -)>; - // CR access instructions in _BrMCR, IIC_BrMCRX. +// Cracked, restricted, ALU operations. +// Here the two ALU ops can actually be done in parallel and therefore the +// latencies are not added together. Otherwise this is like having two +// instructions running together on two pipelines and 6 dispatches. +// ALU ops are 2 cycles each. def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs @@ -690,13 +874,12 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, MTCRF8 )>; -def : InstRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs - MCRF, - MCRXRX -)>; - -def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C, +// Cracked, restricted, ALU operations. +// Here the two ALU ops can actually be done in parallel and therefore the +// latencies are not added together. Otherwise this is like having two +// instructions running together on two pipelines and 6 dispatches. +// ALU ops are 3 cycles each. +def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs MCRFS @@ -704,93 +887,71 @@ def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C, // FP Div instructions in IIC_FPDivD and IIC_FPDivS. +// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. 
def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - FDIV, - XSDIVDP + FDIV )>; -def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU. +def : InstRW<[P9_DPOpAndALUOp_35C_8, IP_EXEC_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - FDIVS, - XSDIVSP + FDIVo )>; -def : InstRW<[P9_DP_24C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Instruction. Takes one slice and 2 dispatches. +def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - XVDIVSP + XSDIVDP )>; -def : InstRW<[P9_DP_33C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - XVDIVDP + FDIVS )>; -// FP Instructions in IIC_FPGeneral, IIC_FPFused +// 22 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU. +def : InstRW<[P9_DPOpAndALUOp_24C_5, IP_EXEC_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + FDIVSo +)>; -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 22 Cycle DP Instruction. Takes one slice and 2 dispatches. +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - FRSP, - FRIND, - FRINS, - FRIPD, - FRIPS, - FRIZD, - FRIZS, - FRIMD, - FRIMS, - FRE, - FRES, - FRSQRTE, - FRSQRTES, - FMADDS, - FMADD, - FMSUBS, - FMSUB, - FNMADDS, - FNMADD, - FNMSUBS, - FNMSUB, - FSELD, - FSELS, - FADDS, - FMULS, - FMUL, - FSUBS, - FCFID, - FCTID, - FCTIDZ, - FCFIDU, - FCFIDS, - FCFIDUS, - FCTIDUZ, - FCTIWUZ, - FCTIW, - FCTIWZ + XSDIVSP )>; -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 24 Cycle DP Vector Instruction. Takes one full superslice. +// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given +// superslice. 
+def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs - FMR, - FABSD, - FABSS, - FNABSD, - FNABSS, - FNEGD, - FNEGS, - FCPSGND, - FCPSGNS + XVDIVSP )>; -def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Vector Instruction. Takes one full superslice. +// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given +// superslice. +def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs - FCMPUS, - FCMPUD + XVDIVDP )>; // Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX. -def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C, +// Instruction cracked into three pieces. One Load and two ALU operations. +// The Load and one of the ALU ops cannot be run at the same time and so the +// latencies are added together for 6 cycles. The remaining ALU is 2 cycles. +// Both the load and the ALU that depends on it are restricted and so they take +// a total of 6 dispatches. The final 2 dispatches come from the second ALU op. +// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load. +def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], @@ -799,10 +960,32 @@ def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C, LFSUX )>; -def : InstRW<[P9_LS_5C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, +// Cracked instruction made up of a Load and an ALU. The ALU does not depend on +// the load and so it can be run at the same time as the load. The load is also +// restricted. 3 dispatches are from the restricted load while the other two +// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline +// is required for the ALU. +def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LFDU, LFDUX )>; +// Crypto Instructions + +// 6 Cycle CY operation. 
Only one CY unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + VPMSUMB, + VPMSUMD, + VPMSUMH, + VPMSUMW, + VCIPHER, + VCIPHERLAST, + VNCIPHER, + VNCIPHERLAST, + VSBOX +)>; |