diff options
Diffstat (limited to 'lib/Target/X86/X86ScheduleZnver1.td')
-rw-r--r-- | lib/Target/X86/X86ScheduleZnver1.td | 223 |
1 files changed, 223 insertions, 0 deletions
diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td new file mode 100644 index 0000000000000..d5b4cfe2ddee0 --- /dev/null +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -0,0 +1,223 @@ +//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Znver1 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Znver1Model : SchedMachineModel { + // Zen can decode 4 instructions per cycle. + let IssueWidth = 4; + // Based on the reorder buffer we define MicroOpBufferSize + let MicroOpBufferSize = 192; + let LoadLatency = 4; + let MispredictPenalty = 17; + let HighLatency = 25; + let PostRAScheduler = 1; + + // FIXME: This variable is required for incomplete model. + // We haven't catered all instructions. + // So, we reset the value of this variable so as to + // say that the model is incomplete. + let CompleteModel = 0; +} + +let SchedModel = Znver1Model in { + +// Zen can issue micro-ops to 10 different units in one cycle. +// These are +// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3) +// * Two AGU units (ZAGU0, ZAGU1) +// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3) +// AGUs feed load store queues @two loads and 1 store per cycle. + +// Four ALU units are defined below +def ZnALU0 : ProcResource<1>; +def ZnALU1 : ProcResource<1>; +def ZnALU2 : ProcResource<1>; +def ZnALU3 : ProcResource<1>; + +// Two AGU units are defined below +def ZnAGU0 : ProcResource<1>; +def ZnAGU1 : ProcResource<1>; + +// Four FPU units are defined below +def ZnFPU0 : ProcResource<1>; +def ZnFPU1 : ProcResource<1>; +def ZnFPU2 : ProcResource<1>; +def ZnFPU3 : ProcResource<1>; + +// FPU grouping +def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>; +def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>; +def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>; +def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>; +def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>; +def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>; +def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>; +def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>; + +// Below are the grouping of the units. +// Micro-ops to be issued to multiple units are tackled this way. + +// ALU grouping +// ZnALU03 - 0,3 grouping +def ZnALU03: ProcResGroup<[ZnALU0, ZnALU3]>; + +// 56 Entry (14x4 entries) Int Scheduler +def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> { + let BufferSize=56; +} + +// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations +// but are relevant for some instructions +def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> { + let BufferSize=28; +} + +// Integer Multiplication issued on ALU1. +def ZnMultiplier : ProcResource<1>; + +// Integer division issued on ALU2. +def ZnDivider : ProcResource<1>; + +// 4 Cycles load-to use Latency is captured +def : ReadAdvance<ReadAfterLd, 4>; + +// (a folded load is an instruction that loads and does some operation) +// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops. +// a. load and +// b. addpd +// This multiclass is for folded loads for integer units. +multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 4 cycles to the latency. + def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// This multiclass is for folded loads for floating point units. +multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 7 cycles to the latency. + def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> { + let Latency = !add(Lat, 7); + } +} + +// WriteRMW is set for instructions with Memory write +// operation in codegen +def : WriteRes<WriteRMW, [ZnAGU]>; + +def : WriteRes<WriteStore, [ZnAGU]>; +def : WriteRes<WriteMove, [ZnALU]>; +def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; } + +def : WriteRes<WriteZero, []>; +def : WriteRes<WriteLEA, [ZnALU]>; +defm : ZnWriteResPair<WriteALU, ZnALU, 1>; +defm : ZnWriteResPair<WriteShift, ZnALU, 1>; +defm : ZnWriteResPair<WriteJump, ZnALU, 1>; + +// IDIV +def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> { + let Latency = 41; + let ResourceCycles = [1, 41]; +} + +def : WriteRes<WriteIDivLd, [ZnALU2, ZnAGU, ZnDivider]> { + let Latency = 45; + let ResourceCycles = [1, 4, 41]; +} + +// IMUL +def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{ + let Latency = 4; +} +def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> { + let Latency = 4; +} + +def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> { + let Latency = 8; +} + +// Floating point operations +defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>; +defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>; +defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>; +defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>; +defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>; +defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>; +defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>; +defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>; +defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>; +defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>; +defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>; +defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>; +defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>; +defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>; + +// Vector integer operations which uses FPU units +defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>; +defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>; +defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>; + +// Vector Shift Operations +defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>; + +// AES Instructions. +defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>; +defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>; +defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>; + +def : WriteRes<WriteFence, [ZnAGU]>; +def : WriteRes<WriteNop, []>; + +// Following instructions with latency=100 are microcoded. +// We set long latency so as to block the entire pipeline. +defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>; + +//Microcoded Instructions +let Latency = 100 in { + def : WriteRes<WriteMicrocoded, []>; + def : WriteRes<WriteSystem, []>; + def : WriteRes<WriteMPSAD, []>; + def : WriteRes<WriteMPSADLd, []>; + def : WriteRes<WriteCLMul, []>; + def : WriteRes<WriteCLMulLd, []>; + def : WriteRes<WritePCmpIStrM, []>; + def : WriteRes<WritePCmpIStrMLd, []>; + def : WriteRes<WritePCmpEStrI, []>; + def : WriteRes<WritePCmpEStrILd, []>; + def : WriteRes<WritePCmpEStrM, []>; + def : WriteRes<WritePCmpEStrMLd, []>; + def : WriteRes<WritePCmpIStrI, []>; + def : WriteRes<WritePCmpIStrILd, []>; + } +} |