diff options
Diffstat (limited to 'llvm/lib/Transforms/Utils/BypassSlowDivision.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Utils/BypassSlowDivision.cpp | 478 | 
1 files changed, 478 insertions, 0 deletions
| diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp new file mode 100644 index 000000000000..9a6761040bd8 --- /dev/null +++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -0,0 +1,478 @@ +//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains an optimization for div and rem on architectures that +// execute short instructions significantly faster than longer instructions. +// For example, on Intel Atom 32-bit divides are slow enough that during +// runtime it is profitable to check the value of the operands, and if they are +// positive and less than 256 use an unsigned 8-bit divide. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/BypassSlowDivision.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/KnownBits.h" +#include <cassert> +#include <cstdint> + +using namespace llvm; + +#define DEBUG_TYPE "bypass-slow-division" + +namespace { + +  struct QuotRemPair { +    Value *Quotient; +    Value *Remainder; + +    QuotRemPair(Value *InQuotient, Value *InRemainder) +        : Quotient(InQuotient), Remainder(InRemainder) {} +  }; + +  /// A quotient and remainder, plus a BB from which they logically "originate". +  /// If you use Quotient or Remainder in a Phi node, you should use BB as its +  /// corresponding predecessor. +  struct QuotRemWithBB { +    BasicBlock *BB = nullptr; +    Value *Quotient = nullptr; +    Value *Remainder = nullptr; +  }; + +using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; +using BypassWidthsTy = DenseMap<unsigned, unsigned>; +using VisitedSetTy = SmallPtrSet<Instruction *, 4>; + +enum ValueRange { +  /// Operand definitely fits into BypassType. No runtime checks are needed. +  VALRNG_KNOWN_SHORT, +  /// A runtime check is required, as value range is unknown. +  VALRNG_UNKNOWN, +  /// Operand is unlikely to fit into BypassType. The bypassing should be +  /// disabled. +  VALRNG_LIKELY_LONG +}; + +class FastDivInsertionTask { +  bool IsValidTask = false; +  Instruction *SlowDivOrRem = nullptr; +  IntegerType *BypassType = nullptr; +  BasicBlock *MainBB = nullptr; + +  bool isHashLikeValue(Value *V, VisitedSetTy &Visited); +  ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); +  QuotRemWithBB createSlowBB(BasicBlock *Successor); +  QuotRemWithBB createFastBB(BasicBlock *Successor); +  QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, +                                   BasicBlock *PhiBB); +  Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); +  Optional<QuotRemPair> insertFastDivAndRem(); + +  bool isSignedOp() { +    return SlowDivOrRem->getOpcode() == Instruction::SDiv || +           SlowDivOrRem->getOpcode() == Instruction::SRem; +  } + +  bool isDivisionOp() { +    return SlowDivOrRem->getOpcode() == Instruction::SDiv || +           SlowDivOrRem->getOpcode() == Instruction::UDiv; +  } + +  Type *getSlowType() { return SlowDivOrRem->getType(); } + +public: +  FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); + +  Value *getReplacement(DivCacheTy &Cache); +}; + +} // end anonymous namespace + +FastDivInsertionTask::FastDivInsertionTask(Instruction *I, +                                           const BypassWidthsTy &BypassWidths) { +  switch (I->getOpcode()) { +  case Instruction::UDiv: +  case Instruction::SDiv: +  case Instruction::URem: +  case Instruction::SRem: +    SlowDivOrRem = I; +    break; +  default: +    // I is not a div/rem operation. +    return; +  } + +  // Skip division on vector types. Only optimize integer instructions. +  IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType()); +  if (!SlowType) +    return; + +  // Skip if this bitwidth is not bypassed. +  auto BI = BypassWidths.find(SlowType->getBitWidth()); +  if (BI == BypassWidths.end()) +    return; + +  // Get type for div/rem instruction with bypass bitwidth. +  IntegerType *BT = IntegerType::get(I->getContext(), BI->second); +  BypassType = BT; + +  // The original basic block. +  MainBB = I->getParent(); + +  // The instruction is indeed a slow div or rem operation. +  IsValidTask = true; +} + +/// Reuses previously-computed dividend or remainder from the current BB if +/// operands and operation are identical. Otherwise calls insertFastDivAndRem to +/// perform the optimization and caches the resulting dividend and remainder. +/// If no replacement can be generated, nullptr is returned. +Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { +  // First, make sure that the task is valid. +  if (!IsValidTask) +    return nullptr; + +  // Then, look for a value in Cache. +  Value *Dividend = SlowDivOrRem->getOperand(0); +  Value *Divisor = SlowDivOrRem->getOperand(1); +  DivRemMapKey Key(isSignedOp(), Dividend, Divisor); +  auto CacheI = Cache.find(Key); + +  if (CacheI == Cache.end()) { +    // If previous instance does not exist, try to insert fast div. +    Optional<QuotRemPair> OptResult = insertFastDivAndRem(); +    // Bail out if insertFastDivAndRem has failed. +    if (!OptResult) +      return nullptr; +    CacheI = Cache.insert({Key, *OptResult}).first; +  } + +  QuotRemPair &Value = CacheI->second; +  return isDivisionOp() ? Value.Quotient : Value.Remainder; +} + +/// Check if a value looks like a hash. +/// +/// The routine is expected to detect values computed using the most common hash +/// algorithms. Typically, hash computations end with one of the following +/// instructions: +/// +/// 1) MUL with a constant wider than BypassType +/// 2) XOR instruction +/// +/// And even if we are wrong and the value is not a hash, it is still quite +/// unlikely that such values will fit into BypassType. +/// +/// To detect string hash algorithms like FNV we have to look through PHI-nodes. +/// It is implemented as a depth-first search for values that look neither long +/// nor hash-like. +bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { +  Instruction *I = dyn_cast<Instruction>(V); +  if (!I) +    return false; + +  switch (I->getOpcode()) { +  case Instruction::Xor: +    return true; +  case Instruction::Mul: { +    // After Constant Hoisting pass, long constants may be represented as +    // bitcast instructions. As a result, some constants may look like an +    // instruction at first, and an additional check is necessary to find out if +    // an operand is actually a constant. +    Value *Op1 = I->getOperand(1); +    ConstantInt *C = dyn_cast<ConstantInt>(Op1); +    if (!C && isa<BitCastInst>(Op1)) +      C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); +    return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth(); +  } +  case Instruction::PHI: +    // Stop IR traversal in case of a crazy input code. This limits recursion +    // depth. +    if (Visited.size() >= 16) +      return false; +    // Do not visit nodes that have been visited already. We return true because +    // it means that we couldn't find any value that doesn't look hash-like. +    if (Visited.find(I) != Visited.end()) +      return true; +    Visited.insert(I); +    return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { +      // Ignore undef values as they probably don't affect the division +      // operands. +      return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || +             isa<UndefValue>(V); +    }); +  default: +    return false; +  } +} + +/// Check if an integer value fits into our bypass type. +ValueRange FastDivInsertionTask::getValueRange(Value *V, +                                               VisitedSetTy &Visited) { +  unsigned ShortLen = BypassType->getBitWidth(); +  unsigned LongLen = V->getType()->getIntegerBitWidth(); + +  assert(LongLen > ShortLen && "Value type must be wider than BypassType"); +  unsigned HiBits = LongLen - ShortLen; + +  const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); +  KnownBits Known(LongLen); + +  computeKnownBits(V, Known, DL); + +  if (Known.countMinLeadingZeros() >= HiBits) +    return VALRNG_KNOWN_SHORT; + +  if (Known.countMaxLeadingZeros() < HiBits) +    return VALRNG_LIKELY_LONG; + +  // Long integer divisions are often used in hashtable implementations. It's +  // not worth bypassing such divisions because hash values are extremely +  // unlikely to have enough leading zeros. The call below tries to detect +  // values that are unlikely to fit BypassType (including hashes). +  if (isHashLikeValue(V, Visited)) +    return VALRNG_LIKELY_LONG; + +  return VALRNG_UNKNOWN; +} + +/// Add new basic block for slow div and rem operations and put it before +/// SuccessorBB. +QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { +  QuotRemWithBB DivRemPair; +  DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", +                                     MainBB->getParent(), SuccessorBB); +  IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); + +  Value *Dividend = SlowDivOrRem->getOperand(0); +  Value *Divisor = SlowDivOrRem->getOperand(1); + +  if (isSignedOp()) { +    DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); +    DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); +  } else { +    DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); +    DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); +  } + +  Builder.CreateBr(SuccessorBB); +  return DivRemPair; +} + +/// Add new basic block for fast div and rem operations and put it before +/// SuccessorBB. +QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { +  QuotRemWithBB DivRemPair; +  DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", +                                     MainBB->getParent(), SuccessorBB); +  IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); + +  Value *Dividend = SlowDivOrRem->getOperand(0); +  Value *Divisor = SlowDivOrRem->getOperand(1); +  Value *ShortDivisorV = +      Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); +  Value *ShortDividendV = +      Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); + +  // udiv/urem because this optimization only handles positive numbers. +  Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); +  Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); +  DivRemPair.Quotient = +      Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); +  DivRemPair.Remainder = +      Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); +  Builder.CreateBr(SuccessorBB); + +  return DivRemPair; +} + +/// Creates Phi nodes for result of Div and Rem. +QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, +                                                       QuotRemWithBB &RHS, +                                                       BasicBlock *PhiBB) { +  IRBuilder<> Builder(PhiBB, PhiBB->begin()); +  PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); +  QuoPhi->addIncoming(LHS.Quotient, LHS.BB); +  QuoPhi->addIncoming(RHS.Quotient, RHS.BB); +  PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); +  RemPhi->addIncoming(LHS.Remainder, LHS.BB); +  RemPhi->addIncoming(RHS.Remainder, RHS.BB); +  return QuotRemPair(QuoPhi, RemPhi); +} + +/// Creates a runtime check to test whether both the divisor and dividend fit +/// into BypassType. The check is inserted at the end of MainBB. True return +/// value means that the operands fit. Either of the operands may be NULL if it +/// doesn't need a runtime check. +Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { +  assert((Op1 || Op2) && "Nothing to check"); +  IRBuilder<> Builder(MainBB, MainBB->end()); + +  Value *OrV; +  if (Op1 && Op2) +    OrV = Builder.CreateOr(Op1, Op2); +  else +    OrV = Op1 ? Op1 : Op2; + +  // BitMask is inverted to check if the operands are +  // larger than the bypass type +  uint64_t BitMask = ~BypassType->getBitMask(); +  Value *AndV = Builder.CreateAnd(OrV, BitMask); + +  // Compare operand values +  Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); +  return Builder.CreateICmpEQ(AndV, ZeroV); +} + +/// Substitutes the div/rem instruction with code that checks the value of the +/// operands and uses a shorter-faster div/rem instruction when possible. +Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { +  Value *Dividend = SlowDivOrRem->getOperand(0); +  Value *Divisor = SlowDivOrRem->getOperand(1); + +  VisitedSetTy SetL; +  ValueRange DividendRange = getValueRange(Dividend, SetL); +  if (DividendRange == VALRNG_LIKELY_LONG) +    return None; + +  VisitedSetTy SetR; +  ValueRange DivisorRange = getValueRange(Divisor, SetR); +  if (DivisorRange == VALRNG_LIKELY_LONG) +    return None; + +  bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); +  bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); + +  if (DividendShort && DivisorShort) { +    // If both operands are known to be short then just replace the long +    // division with a short one in-place.  Since we're not introducing control +    // flow in this case, narrowing the division is always a win, even if the +    // divisor is a constant (and will later get replaced by a multiplication). + +    IRBuilder<> Builder(SlowDivOrRem); +    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); +    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); +    Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); +    Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); +    Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); +    Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); +    return QuotRemPair(ExtDiv, ExtRem); +  } + +  if (isa<ConstantInt>(Divisor)) { +    // If the divisor is not a constant, DAGCombiner will convert it to a +    // multiplication by a magic constant.  It isn't clear if it is worth +    // introducing control flow to get a narrower multiply. +    return None; +  } + +  // After Constant Hoisting pass, long constants may be represented as +  // bitcast instructions. As a result, some constants may look like an +  // instruction at first, and an additional check is necessary to find out if +  // an operand is actually a constant. +  if (auto *BCI = dyn_cast<BitCastInst>(Divisor)) +    if (BCI->getParent() == SlowDivOrRem->getParent() && +        isa<ConstantInt>(BCI->getOperand(0))) +      return None; + +  if (DividendShort && !isSignedOp()) { +    // If the division is unsigned and Dividend is known to be short, then +    // either +    // 1) Divisor is less or equal to Dividend, and the result can be computed +    //    with a short division. +    // 2) Divisor is greater than Dividend. In this case, no division is needed +    //    at all: The quotient is 0 and the remainder is equal to Dividend. +    // +    // So instead of checking at runtime whether Divisor fits into BypassType, +    // we emit a runtime check to differentiate between these two cases. This +    // lets us entirely avoid a long div. + +    // Split the basic block before the div/rem. +    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); +    // Remove the unconditional branch from MainBB to SuccessorBB. +    MainBB->getInstList().back().eraseFromParent(); +    QuotRemWithBB Long; +    Long.BB = MainBB; +    Long.Quotient = ConstantInt::get(getSlowType(), 0); +    Long.Remainder = Dividend; +    QuotRemWithBB Fast = createFastBB(SuccessorBB); +    QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); +    IRBuilder<> Builder(MainBB, MainBB->end()); +    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); +    Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); +    return Result; +  } else { +    // General case. Create both slow and fast div/rem pairs and choose one of +    // them at runtime. + +    // Split the basic block before the div/rem. +    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); +    // Remove the unconditional branch from MainBB to SuccessorBB. +    MainBB->getInstList().back().eraseFromParent(); +    QuotRemWithBB Fast = createFastBB(SuccessorBB); +    QuotRemWithBB Slow = createSlowBB(SuccessorBB); +    QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); +    Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, +                                            DivisorShort ? nullptr : Divisor); +    IRBuilder<> Builder(MainBB, MainBB->end()); +    Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); +    return Result; +  } +} + +/// This optimization identifies DIV/REM instructions in a BB that can be +/// profitably bypassed and carried out with a shorter, faster divide. +bool llvm::bypassSlowDivision(BasicBlock *BB, +                              const BypassWidthsTy &BypassWidths) { +  DivCacheTy PerBBDivCache; + +  bool MadeChange = false; +  Instruction *Next = &*BB->begin(); +  while (Next != nullptr) { +    // We may add instructions immediately after I, but we want to skip over +    // them. +    Instruction *I = Next; +    Next = Next->getNextNode(); + +    // Ignore dead code to save time and avoid bugs. +    if (I->hasNUses(0)) +      continue; + +    FastDivInsertionTask Task(I, BypassWidths); +    if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { +      I->replaceAllUsesWith(Replacement); +      I->eraseFromParent(); +      MadeChange = true; +    } +  } + +  // Above we eagerly create divs and rems, as pairs, so that we can efficiently +  // create divrem machine instructions.  Now erase any unused divs / rems so we +  // don't leave extra instructions sitting around. +  for (auto &KV : PerBBDivCache) +    for (Value *V : {KV.second.Quotient, KV.second.Remainder}) +      RecursivelyDeleteTriviallyDeadInstructions(V); + +  return MadeChange; +} | 
