//===-- Uops.cpp ------------------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #include "Uops.h" #include "Assembler.h" #include "BenchmarkRunner.h" #include "MCInstrDescView.h" #include "Target.h" // FIXME: Load constants into registers (e.g. with fld1) to not break // instructions like x87. // Ideally we would like the only limitation on executing uops to be the issue // ports. Maximizing port pressure increases the likelihood that the load is // distributed evenly across possible ports. // To achieve that, one approach is to generate instructions that do not have // data dependencies between them. // // For some instructions, this is trivial: // mov rax, qword ptr [rsi] // mov rax, qword ptr [rsi] // mov rax, qword ptr [rsi] // mov rax, qword ptr [rsi] // For the above snippet, haswell just renames rax four times and executes the // four instructions two at a time on P23 and P0126. // // For some instructions, we just need to make sure that the source is // different from the destination. For example, IDIV8r reads from GPR and // writes to AX. We just need to ensure that the Var is assigned a // register which is different from AX: // idiv bx // idiv bx // idiv bx // idiv bx // The above snippet will be able to fully saturate the ports, while the same // with ax would issue one uop every `latency(IDIV8r)` cycles. // // Some instructions make this harder because they both read and write from // the same register: // inc rax // inc rax // inc rax // inc rax // This has a data dependency from each instruction to the next, limit the // number of instructions that can be issued in parallel. // It turns out that this is not a big issue on recent Intel CPUs because they // have heuristics to balance port pressure. In the snippet above, subsequent // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs // might end up executing them all on P0 (just because they can), or try // avoiding P5 because it's usually under high pressure from vector // instructions. // This issue is even more important for high-latency instructions because // they increase the idle time of the CPU, e.g. : // imul rax, rbx // imul rax, rbx // imul rax, rbx // imul rax, rbx // // To avoid that, we do the renaming statically by generating as many // independent exclusive assignments as possible (until all possible registers // are exhausted) e.g.: // imul rax, rbx // imul rcx, rbx // imul rdx, rbx // imul r8, rbx // // Some instruction even make the above static renaming impossible because // they implicitly read and write from the same operand, e.g. ADC16rr reads // and writes from EFLAGS. // In that case we just use a greedy register assignment and hope for the // best. namespace llvm { namespace exegesis { static llvm::SmallVector getVariablesWithTiedOperands(const Instruction &Instr) { llvm::SmallVector Result; for (const auto &Var : Instr.Variables) if (Var.hasTiedOperands()) Result.push_back(&Var); return Result; } static void remove(llvm::BitVector &a, const llvm::BitVector &b) { assert(a.size() == b.size()); for (auto I : b.set_bits()) a.reset(I); } UopsBenchmarkRunner::~UopsBenchmarkRunner() = default; UopsSnippetGenerator::~UopsSnippetGenerator() = default; void UopsSnippetGenerator::instantiateMemoryOperands( const unsigned ScratchSpacePointerInReg, std::vector &Instructions) const { if (ScratchSpacePointerInReg == 0) return; // no memory operands. const auto &ET = State.getExegesisTarget(); const unsigned MemStep = ET.getMaxMemoryAccessSize(); const size_t OriginalInstructionsSize = Instructions.size(); size_t I = 0; for (InstructionTemplate &IT : Instructions) { ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep); ++I; } while (Instructions.size() < kMinNumDifferentAddresses) { InstructionTemplate IT = Instructions[I % OriginalInstructionsSize]; ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep); ++I; Instructions.push_back(std::move(IT)); } assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize && "not enough scratch space"); } llvm::Expected> UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const { CodeTemplate CT; const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr; if (Instr.hasMemoryOperands()) { const auto &ET = State.getExegesisTarget(); CT.ScratchSpacePointerInReg = ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple()); if (CT.ScratchSpacePointerInReg == 0) return llvm::make_error( "Infeasible : target does not support memory instructions"); ScratchSpaceAliasedRegs = &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits(); // If the instruction implicitly writes to ScratchSpacePointerInReg , abort. // FIXME: We could make a copy of the scratch register. for (const auto &Op : Instr.Operands) { if (Op.isDef() && Op.isImplicitReg() && ScratchSpaceAliasedRegs->test(Op.getImplicitReg())) return llvm::make_error( "Infeasible : memory instruction uses scratch memory register"); } } const AliasingConfigurations SelfAliasing(Instr, Instr); InstructionTemplate IT(Instr); if (SelfAliasing.empty()) { CT.Info = "instruction is parallel, repeating a random one."; CT.Instructions.push_back(std::move(IT)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } if (SelfAliasing.hasImplicitAliasing()) { CT.Info = "instruction is serial, repeating a random one."; CT.Instructions.push_back(std::move(IT)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } const auto TiedVariables = getVariablesWithTiedOperands(Instr); if (!TiedVariables.empty()) { if (TiedVariables.size() > 1) return llvm::make_error( "Infeasible : don't know how to handle several tied variables", llvm::inconvertibleErrorCode()); const Variable *Var = TiedVariables.front(); assert(Var); const Operand &Op = Instr.getPrimaryOperand(*Var); assert(Op.isReg()); CT.Info = "instruction has tied variables using static renaming."; for (const llvm::MCPhysReg Reg : Op.getRegisterAliasing().sourceBits().set_bits()) { if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg)) continue; // Do not use the scratch memory address register. InstructionTemplate TmpIT = IT; TmpIT.getValueFor(*Var) = llvm::MCOperand::createReg(Reg); CT.Instructions.push_back(std::move(TmpIT)); } instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } const auto &ReservedRegisters = State.getRATC().reservedRegisters(); // No tied variables, we pick random values for defs. llvm::BitVector Defs(State.getRegInfo().getNumRegs()); for (const auto &Op : Instr.Operands) { if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) { auto PossibleRegisters = Op.getRegisterAliasing().sourceBits(); remove(PossibleRegisters, ReservedRegisters); // Do not use the scratch memory address register. if (ScratchSpaceAliasedRegs) remove(PossibleRegisters, *ScratchSpaceAliasedRegs); assert(PossibleRegisters.any() && "No register left to choose from"); const auto RandomReg = randomBit(PossibleRegisters); Defs.set(RandomReg); IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg); } } // And pick random use values that are not reserved and don't alias with defs. const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs); for (const auto &Op : Instr.Operands) { if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) { auto PossibleRegisters = Op.getRegisterAliasing().sourceBits(); remove(PossibleRegisters, ReservedRegisters); // Do not use the scratch memory address register. if (ScratchSpaceAliasedRegs) remove(PossibleRegisters, *ScratchSpaceAliasedRegs); remove(PossibleRegisters, DefAliases); assert(PossibleRegisters.any() && "No register left to choose from"); const auto RandomReg = randomBit(PossibleRegisters); IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg); } } CT.Info = "instruction has no tied variables picking Uses different from defs"; CT.Instructions.push_back(std::move(IT)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } llvm::Expected> UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const { std::vector Result; const PfmCountersInfo &PCI = State.getPfmCounters(); // Uops per port. for (const auto *IssueCounter = PCI.IssueCounters, *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters; IssueCounter != IssueCounterEnd; ++IssueCounter) { if (!IssueCounter->Counter) continue; auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter); if (!ExpectedCounterValue) return ExpectedCounterValue.takeError(); Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName, *ExpectedCounterValue)); } // NumMicroOps. if (const char *const UopsCounter = PCI.UopsCounter) { auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter); if (!ExpectedCounterValue) return ExpectedCounterValue.takeError(); Result.push_back( BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue)); } return std::move(Result); } constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses; } // namespace exegesis } // namespace llvm