summaryrefslogtreecommitdiff
path: root/lib/Target/PowerPC/P9InstrResources.td
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2017-12-18 20:10:56 +0000
committerDimitry Andric <dim@FreeBSD.org>2017-12-18 20:10:56 +0000
commit044eb2f6afba375a914ac9d8024f8f5142bb912e (patch)
tree1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/Target/PowerPC/P9InstrResources.td
parenteb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b (diff)
downloadsrc-test2-044eb2f6afba375a914ac9d8024f8f5142bb912e.tar.gz
src-test2-044eb2f6afba375a914ac9d8024f8f5142bb912e.zip
Notes
Diffstat (limited to 'lib/Target/PowerPC/P9InstrResources.td')
-rw-r--r--lib/Target/PowerPC/P9InstrResources.td687
1 files changed, 435 insertions, 252 deletions
diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td
index aea022f88766..dc6ed16e53ce 100644
--- a/lib/Target/PowerPC/P9InstrResources.td
+++ b/lib/Target/PowerPC/P9InstrResources.td
@@ -12,11 +12,29 @@
// is listed here. Instructions in this file belong to itinerary classes that
// have instructions with different resource requirements.
//
+// The makeup of the P9 CPU is modeled as follows:
+// - Each CPU is made up of two superslices.
+// - Each superslice is made up of two slices. Therefore, there are 4 slices
+// for each CPU.
+// - Up to 6 instructions can be dispatched to each CPU. Three per superslice.
+// - Each CPU has:
+// - One CY (Crypto) unit P9_CY_*
+// - One DFU (Decimal Floating Point and Quad Precision) unit P9_DFU_*
+// - Two PM (Permute) units. One on each superslice. P9_PM_*
+// - Two DIV (Fixed Point Divide) units. One on each superslice. P9_DIV_*
+// - Four ALU (Fixed Point Arithmetic) units. One on each slice. P9_ALU_*
+// - Four DP (Floating Point) units. One on each slice. P9_DP_*
+// This also includes fixed point multiply add.
+// - Four AGEN (Address Generation) units. One for each slice. P9_AGEN_*
+// - Four Load/Store Queues. P9_LS_*
+// - Each set of instructions will require a number of these resources.
//===----------------------------------------------------------------------===//
-
+// Two cycle ALU vector operation that uses an entire superslice.
+// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
VADDCUW,
VADDUBM,
@@ -26,47 +44,41 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
VAND,
VANDC,
VCMPEQUB,
- VCMPEQUBo,
VCMPEQUD,
- VCMPEQUDo,
VCMPEQUH,
- VCMPEQUHo,
VCMPEQUW,
- VCMPEQUWo,
- VCMPGTSB,
- VCMPGTSBo,
- VCMPGTSD,
- VCMPGTSDo,
- VCMPGTSH,
- VCMPGTSHo,
- VCMPGTSW,
- VCMPGTSWo,
- VCMPGTUB,
- VCMPGTUBo,
- VCMPGTUD,
- VCMPGTUDo,
- VCMPGTUH,
- VCMPGTUHo,
- VCMPGTUW,
- VCMPGTUWo,
VCMPNEB,
- VCMPNEBo,
VCMPNEH,
- VCMPNEHo,
VCMPNEW,
- VCMPNEWo,
VCMPNEZB,
- VCMPNEZBo,
VCMPNEZH,
- VCMPNEZHo,
VCMPNEZW,
- VCMPNEZWo,
VEQV,
VEXTSB2D,
VEXTSB2W,
VEXTSH2D,
VEXTSH2W,
VEXTSW2D,
+ VRLB,
+ VRLD,
+ VRLDMI,
+ VRLDNM,
+ VRLH,
+ VRLW,
+ VRLWMI,
+ VRLWNM,
+ VSRAB,
+ VSRAD,
+ VSRAH,
+ VSRAW,
+ VSRB,
+ VSRD,
+ VSRH,
+ VSRW,
+ VSLB,
+ VSLD,
+ VSLH,
+ VSLW,
VMRGEW,
VMRGOW,
VNAND,
@@ -77,9 +89,7 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
VORC,
VPOPCNTB,
VPOPCNTH,
- VPOPCNTW,
VSEL,
- VSUBCUW,
VSUBUBM,
VSUBUDM,
VSUBUHM,
@@ -98,6 +108,8 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
XVNEGDP,
XVNEGSP,
XVXEXPDP,
+ XVIEXPSP,
+ XVXEXPSP,
XXLAND,
XXLANDC,
XXLEQV,
@@ -107,28 +119,128 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
XXLORf,
XXLORC,
XXLXOR,
- XXSEL
-)>;
-
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
- (instrs
+ XXSEL,
XSABSQP,
XSCPSGNQP,
XSIEXPQP,
XSNABSQP,
XSNEGQP,
- XSXEXPQP,
- XSABSDP,
- XSCPSGNDP,
- XSIEXPDP,
+ XSXEXPQP
+)>;
+
+// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a
+// single slice. However, since it is Restricted it requires all 3 dispatches
+// (DISP) for that superslice.
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FCMPUS,
+ FCMPUD,
+ XSTSTDCDP,
+ XSTSTDCSP
+)>;
+
+// Standard Dispatch ALU operation for 3 cycles. Only one slice used.
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSMAXCDP,
+ XSMAXDP,
+ XSMAXJDP,
+ XSMINCDP,
+ XSMINDP,
+ XSMINJDP,
+ XSTDIVDP,
+ XSTSQRTDP,
+ XSCMPEQDP,
+ XSCMPEXPDP,
+ XSCMPGEDP,
+ XSCMPGTDP,
+ XSCMPODP,
+ XSCMPUDP,
+ XSXSIGDP,
+ XSCVSPDPN
+)>;
+
+// Standard Dispatch ALU operation for 2 cycles. Only one slice used.
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ ADDIStocHA,
+ ADDItocL,
+ MCRF,
+ MCRXRX,
+ SLD,
+ SRD,
+ SRAD,
+ SRADI,
+ RLDIC,
XSNABSDP,
+ XSXEXPDP,
+ XSABSDP,
XSNEGDP,
- XSXEXPDP
+ XSCPSGNDP
)>;
-def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a
+// single slice. However, since it is Restricted it requires all 3 dispatches
+// (DISP) for that superslice.
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ RLDCL,
+ RLDCR,
+ RLDIMI,
+ RLDICL,
+ RLDICR,
+ RLDICL_32_64,
+ XSIEXPDP,
+ FMR,
+ FABSD,
+ FABSS,
+ FNABSD,
+ FNABSS,
+ FNEGD,
+ FNEGS,
+ FCPSGND,
+ FCPSGNS
+)>;
+// Three cycle ALU vector operation that uses an entire superslice.
+// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
+def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ VBPERMD,
+ VABSDUB,
+ VABSDUH,
+ VABSDUW,
+ VADDUBS,
+ VADDUHS,
+ VADDUWS,
+ VAVGSB,
+ VAVGSH,
+ VAVGSW,
+ VAVGUB,
+ VAVGUH,
+ VAVGUW,
+ VCMPEQFP,
+ VCMPEQFPo,
+ VCMPGEFP,
+ VCMPGEFPo,
+ VCMPBFP,
+ VCMPBFPo,
+ VCMPGTFP,
+ VCMPGTFPo,
+ VCLZB,
+ VCLZD,
+ VCLZH,
+ VCLZW,
+ VCTZB,
+ VCTZD,
+ VCTZH,
+ VCTZW,
+ VADDSBS,
+ VADDSHS,
+ VADDSWS,
+ VMINFP,
VMINSB,
VMINSD,
VMINSH,
@@ -137,55 +249,54 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C
VMINUD,
VMINUH,
VMINUW,
+ VMAXFP,
+ VMAXSB,
+ VMAXSD,
+ VMAXSH,
+ VMAXSW,
+ VMAXUB,
+ VMAXUD,
+ VMAXUH,
+ VMAXUW,
+ VPOPCNTW,
VPOPCNTD,
VPRTYBD,
VPRTYBW,
- VRLB,
- VRLD,
- VRLDMI,
- VRLDNM,
- VRLH,
- VRLW,
- VRLWMI,
- VRLWNM,
VSHASIGMAD,
VSHASIGMAW,
- VSLB,
- VSLD,
- VSLH,
- VSLW,
- VSRAB,
- VSRAD,
- VSRAH,
- VSRAW,
- VSRB,
- VSRD,
- VSRH,
- VSRW,
VSUBSBS,
VSUBSHS,
VSUBSWS,
VSUBUBS,
VSUBUHS,
VSUBUWS,
- XSCMPEQDP,
- XSCMPEXPDP,
- XSCMPGEDP,
- XSCMPGTDP,
- XSCMPODP,
- XSCMPUDP,
- XSCVSPDPN,
- XSMAXCDP,
- XSMAXDP,
- XSMAXJDP,
- XSMINCDP,
- XSMINDP,
- XSMINJDP,
- XSTDIVDP,
- XSTSQRTDP,
- XSTSTDCDP,
- XSTSTDCSP,
- XSXSIGDP,
+ VSUBCUW,
+ VCMPGTSB,
+ VCMPGTSBo,
+ VCMPGTSD,
+ VCMPGTSDo,
+ VCMPGTSH,
+ VCMPGTSHo,
+ VCMPGTSW,
+ VCMPGTSWo,
+ VCMPGTUB,
+ VCMPGTUBo,
+ VCMPGTUD,
+ VCMPGTUDo,
+ VCMPGTUH,
+ VCMPGTUHo,
+ VCMPGTUW,
+ VCMPGTUWo,
+ VCMPNEBo,
+ VCMPNEHo,
+ VCMPNEWo,
+ VCMPNEZBo,
+ VCMPNEZHo,
+ VCMPNEZWo,
+ VCMPEQUBo,
+ VCMPEQUDo,
+ VCMPEQUHo,
+ VCMPEQUWo,
XVCMPEQDP,
XVCMPEQDPo,
XVCMPEQSP,
@@ -198,7 +309,6 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C
XVCMPGTDPo,
XVCMPGTSP,
XVCMPGTSPo,
- XVIEXPSP,
XVMAXDP,
XVMAXSP,
XVMINDP,
@@ -209,58 +319,15 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C
XVTSQRTSP,
XVTSTDCDP,
XVTSTDCSP,
- XVXEXPSP,
XVXSIGDP,
XVXSIGSP
)>;
-def : InstRW<[P9_ALUE_4C, P9_ALUO_4C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
- (instrs
- VABSDUB,
- VABSDUH,
- VABSDUW,
- VADDSBS,
- VADDSHS,
- VADDSWS,
- VADDUBS,
- VADDUHS,
- VADDUWS,
- VAVGSB,
- VAVGSH,
- VAVGSW,
- VAVGUB,
- VAVGUH,
- VAVGUW,
- VBPERMD,
- VCLZB,
- VCLZD,
- VCLZH,
- VCLZW,
- VCMPBFP,
- VCMPBFPo,
- VCMPGTFP,
- VCMPGTFPo,
- VCTZB,
- VCTZD,
- VCTZH,
- VCTZW,
- VMAXFP,
- VMAXSB,
- VMAXSD,
- VMAXSH,
- VMAXSW,
- VMAXUB,
- VMAXUD,
- VMAXUH,
- VMAXUW,
- VMINFP,
- VCMPEQFP,
- VCMPEQFPo,
- VCMPGEFP,
- VCMPGEFPo
-)>;
-
-def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 7 cycle DP vector operation that uses an entire superslice.
+// Uses both DP units (the even DPE and odd DPO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
+def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
VADDFP,
VCTSXS,
@@ -367,8 +434,47 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
VSUMSWS
)>;
+// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
+// dispatch units for the superslice.
def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ FRSP,
+ FRIND,
+ FRINS,
+ FRIPD,
+ FRIPS,
+ FRIZD,
+ FRIZS,
+ FRIMD,
+ FRIMS,
+ FRE,
+ FRES,
+ FRSQRTE,
+ FRSQRTES,
+ FMADDS,
+ FMADD,
+ FMSUBS,
+ FMSUB,
+ FNMADDS,
+ FNMADD,
+ FNMSUBS,
+ FNMSUB,
+ FSELD,
+ FSELS,
+ FADDS,
+ FMULS,
+ FMUL,
+ FSUBS,
+ FCFID,
+ FCTID,
+ FCTIDZ,
+ FCFIDU,
+ FCFIDS,
+ FCFIDUS,
+ FCTIDUZ,
+ FCTIWUZ,
+ FCTIW,
+ FCTIWZ,
XSMADDADP,
XSMADDASP,
XSMADDMDP,
@@ -389,7 +495,19 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
XSNMSUBMSP
)>;
+// 7 cycle Restricted DP operation and one 2 cycle ALU operation.
+// The DP is restricted so we need a full 5 dispatches.
+def : InstRW<[P9_DPOpAndALUOp_9C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FMULo,
+ FMADDo,
+ FMSUBo,
+ FNMADDo,
+ FNMSUBo
+)>;
+// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units.
def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
XSADDDP,
@@ -397,8 +515,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSCVDPHP,
XSCVDPSP,
XSCVDPSXDS,
+ XSCVDPSXDSs,
XSCVDPSXWS,
XSCVDPUXDS,
+ XSCVDPUXDSs,
XSCVDPUXWS,
XSCVHPDP,
XSCVSPDP,
@@ -421,7 +541,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSCVDPSPN
)>;
-def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
+// Three Cycle PM operation. Only one PM unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
VBPERMQ,
VCLZLSBB,
@@ -469,7 +592,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
VSLO,
VSLV,
VSPLTB,
+ VSPLTBs,
VSPLTH,
+ VSPLTHs,
VSPLTISB,
VSPLTISH,
VSPLTISW,
@@ -498,6 +623,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
XXSLDWI,
XXSPLTIB,
XXSPLTW,
+ XXSPLTWs,
+ XXPERMDI,
+ XXPERMDIs,
VADDCUQ,
VADDECUQ,
VADDEUQM,
@@ -517,7 +645,10 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
XSXSIGQP
)>;
-def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSADDQP,
XSADDQPO,
@@ -536,7 +667,10 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
XSSUBQPO
)>;
-def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSMADDQP,
XSMADDQPO,
@@ -550,45 +684,57 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
XSNMSUBQPO
)>;
-def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSDIVQP,
XSDIVQPO
)>;
-def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSSQRTQP,
XSSQRTQPO
)>;
-// Load Operation in IIC_LdStLFD
-
+// 5 Cycle load uses a single slice.
def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C],
(instrs
LXSDX,
LXVD2X,
LXSIWZX,
LXV,
- LXSD
+ LXVX,
+ LXSD,
+ DFLOADf64,
+ XFLOADf64
)>;
-def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+// 4 Cycle load uses a single slice.
+def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C],
(instrs
- LFIWZX,
- LFDX,
- LFD
+ COPY
)>;
-def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+// 4 Cycle Restricted load uses a single slice but the dispatch for the whole
+// superslice.
+def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LXSSPX,
- LXSIWAX,
- LXSSP
+ LFIWZX,
+ LFDX,
+ LFD
)>;
-def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
+// Cracked Restricted Load instruction.
+// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
+// operations cannot be done at the same time and so their latencies are added.
+// Full 6 dispatches are required as this is both cracked and restricted.
+def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
LFIWAX,
@@ -596,14 +742,38 @@ def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
LFS
)>;
-def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
+// Cracked Load instruction.
+// Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU
+// operations cannot be done at the same time and so their latencies are added.
+// Full 4 dispatches are required as this is a cracked instruction.
+def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LXSSPX,
+ LXSIWAX,
+ LXSSP,
+ DFLOADf32,
+ XFLOADf32,
+ LIWAX,
+ LIWZX
+)>;
+
+// Cracked Load that requires the PM resource.
+// Since the Load and the PM cannot be done at the same time the latencies are
+// added. Requires 8 cycles.
+// Since the PM requires the full superslice we need both EXECE, EXECO pipelines
+// as well as 3 dispatches for the PM. The Load requires the remaining 2
+// dispatches.
+def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
LXVDSX,
+ LXVWSX,
LXVW4X
)>;
-// Store Operations in IIC_LdStSTFD.
-
+// Single slice Restricted store operation. The restricted operation requires
+// all three dispatches for the superslice.
def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
STFS,
@@ -613,74 +783,88 @@ def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
STFDX,
STXSDX,
STXSSPX,
- STXSIWX
+ STXSIWX,
+ DFSTOREf32,
+ DFSTOREf64,
+ XFSTOREf32,
+ XFSTOREf64,
+ STIWX
)>;
-def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C],
+// Store operation that requires the whole superslice.
+def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
STXVD2X,
STXVW4X
)>;
-// Divide Operations in IIC_IntDivW, IIC_IntDivD.
-
-def : InstRW<[P9_DIV_16C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVW,
- DIVWU
+ DIVWU,
+ MODSW
)>;
-def : InstRW<[P9_DIV_24C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+// 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVWE,
DIVD,
DIVWEU,
- DIVDU
+ DIVDU,
+ MODSD,
+ MODUD,
+ MODUW
)>;
-def : InstRW<[P9_DIV_40C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+// 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVDE,
DIVDEU
)>;
-def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
+// and one full superslice for the DIV operation since there is only one DIV
+// per superslice. Latency of DIV plus ALU is 26.
+def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ DIVDo,
+ DIVDUo,
DIVWEo,
DIVWEUo
)>;
-def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
+// and one full superslice for the DIV operation since there is only one DIV
+// per superslice. Latency of DIV plus ALU is 42.
+def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVDEo,
DIVDEUo
)>;
-// Rotate Operations in IIC_IntRotateD, IIC_IntRotateDI
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
- (instrs
- SLD,
- SRD,
- SRAD,
- SRADI,
- RLDIC
-)>;
-
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- (instrs
- RLDCL,
- RLDCR,
- RLDIMI,
- RLDICL,
- RLDICR,
- RLDICL_32_64
-)>;
-
// CR access instructions in _BrMCR, IIC_BrMCRX.
+// Cracked, restricted, ALU operations.
+// Here the two ALU ops can actually be done in parallel and therefore the
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 6 dispatches.
+// ALU ops are 2 cycles each.
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
@@ -690,13 +874,12 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
MTCRF8
)>;
-def : InstRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
- (instrs
- MCRF,
- MCRXRX
-)>;
-
-def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C,
+// Cracked ALU operations.
+// Here the two ALU ops can actually be done in parallel and therefore the
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 4 dispatches.
+// ALU ops are 3 cycles each.
+def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
MCRFS
@@ -704,93 +887,71 @@ def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C,
// FP Div instructions in IIC_FPDivD and IIC_FPDivS.
+// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- FDIV,
- XSDIVDP
+ FDIV
)>;
-def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 33 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU.
+def : InstRW<[P9_DPOpAndALUOp_35C_8, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- FDIVS,
- XSDIVSP
+ FDIVo
)>;
-def : InstRW<[P9_DP_24C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 33 Cycle DP Instruction. Takes one slice and 2 dispatches.
+def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- XVDIVSP
+ XSDIVDP
)>;
-def : InstRW<[P9_DP_33C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- XVDIVDP
+ FDIVS
)>;
-// FP Instructions in IIC_FPGeneral, IIC_FPFused
+// 22 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU.
+def : InstRW<[P9_DPOpAndALUOp_24C_5, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FDIVSo
+)>;
-def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 22 Cycle DP Instruction. Takes one slice and 2 dispatches.
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- FRSP,
- FRIND,
- FRINS,
- FRIPD,
- FRIPS,
- FRIZD,
- FRIZS,
- FRIMD,
- FRIMS,
- FRE,
- FRES,
- FRSQRTE,
- FRSQRTES,
- FMADDS,
- FMADD,
- FMSUBS,
- FMSUB,
- FNMADDS,
- FNMADD,
- FNMSUBS,
- FNMSUB,
- FSELD,
- FSELS,
- FADDS,
- FMULS,
- FMUL,
- FSUBS,
- FCFID,
- FCTID,
- FCTIDZ,
- FCFIDU,
- FCFIDS,
- FCFIDUS,
- FCTIDUZ,
- FCTIWUZ,
- FCTIW,
- FCTIWZ
+ XSDIVSP
)>;
-def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 24 Cycle DP Vector Instruction. Takes one full superslice.
+// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given
+// superslice.
+def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
- FMR,
- FABSD,
- FABSS,
- FNABSD,
- FNABSS,
- FNEGD,
- FNEGS,
- FCPSGND,
- FCPSGNS
+ XVDIVSP
)>;
-def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 33 Cycle DP Vector Instruction. Takes one full superslice.
+// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given
+// superslice.
+def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
- FCMPUS,
- FCMPUD
+ XVDIVDP
)>;
// Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX.
-def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C,
+// Instruction cracked into three pieces. One Load and two ALU operations.
+// The Load and one of the ALU ops cannot be run at the same time and so the
+// latencies are added together for 6 cycles. The remaining ALU is 2 cycles.
+// Both the load and the ALU that depends on it are restricted and so they take
+// a total of 6 dispatches. The final 2 dispatches come from the second ALU op.
+// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load.
+def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -799,10 +960,32 @@ def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C,
LFSUX
)>;
-def : InstRW<[P9_LS_5C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
+// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
+// the load and so it can be run at the same time as the load. The load is also
+// restricted. 3 dispatches are from the restricted load while the other two
+// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline
+// is required for the ALU.
+def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
LFDU,
LFDUX
)>;
+// Crypto Instructions
+
+// 6 Cycle CY operation. Only one CY unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ VPMSUMB,
+ VPMSUMD,
+ VPMSUMH,
+ VPMSUMW,
+ VCIPHER,
+ VCIPHERLAST,
+ VNCIPHER,
+ VNCIPHERLAST,
+ VSBOX
+)>;