aboutsummaryrefslogtreecommitdiff
path: root/lib/Target
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target')
-rw-r--r--lib/Target/AArch64/AArch64.td1
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp11
-rw-r--r--lib/Target/AMDGPU/AMDGPU.h3
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp22
-rw-r--r--lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp123
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructions.td3
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp53
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h14
-rw-r--r--lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp47
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp8
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp25
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.h3
-rw-r--r--lib/Target/AMDGPU/SIDefines.h3
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp69
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h3
-rw-r--r--lib/Target/AMDGPU/SIInstrFormats.td6
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp3
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.h8
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td9
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td18
-rw-r--r--lib/Target/AMDGPU/SIIntrinsics.td9
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.cpp3
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp9
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.h3
-rw-r--r--lib/Target/AMDGPU/SIWholeQuadMode.cpp88
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp3
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td12
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td10
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp8
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp11
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp5
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp21
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp21
-rw-r--r--lib/Target/X86/X86InstrInfo.h2
-rw-r--r--lib/Target/X86/X86InstrSSE.td35
35 files changed, 489 insertions, 183 deletions
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index b1e881685b0c..b97a0f155dc2 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -250,6 +250,7 @@ def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
FeatureMacroOpFusion,
FeatureNEON,
FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
HasV8_1aOps]>;
def : ProcessorModel<"generic", NoSchedModel, [
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index d6f2a190d4c8..ac7de1b422e0 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7685,6 +7685,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
/// Fold a floating-point multiply by power of two into floating-point to
/// fixed-point conversion.
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
@@ -7728,10 +7729,16 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
break;
case 4:
- ResTy = MVT::v4i32;
+ ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
break;
}
+ if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
+ "Illegal vector type after legalization");
+
SDLoc DL(N);
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
@@ -9853,7 +9860,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performIntToFpCombine(N, DAG, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
- return performFpToIntCombine(N, DAG, Subtarget);
+ return performFpToIntCombine(N, DAG, DCI, Subtarget);
case ISD::FDIV:
return performFDivCombine(N, DAG, Subtarget);
case ISD::OR:
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 7e59710a427a..d4784b5463d7 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -20,6 +20,7 @@ class AMDGPUInstrPrinter;
class AMDGPUSubtarget;
class AMDGPUTargetMachine;
class FunctionPass;
+class GCNTargetMachine;
struct MachineSchedContext;
class MCAsmInfo;
class raw_ostream;
@@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
-FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index cfe6346fb6b1..c9c95c796a69 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -783,15 +783,19 @@ void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) {
emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion,
RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2);
if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage,
- RuntimeMD::OpenCL_C, 1);
- auto Node = MD->getOperand(0);
- unsigned short Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
- ->getZExtValue();
- unsigned short Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
- ->getZExtValue();
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion,
- Major * 100 + Minor * 10, 2);
+ if (MD->getNumOperands()) {
+ auto Node = MD->getOperand(0);
+ if (Node->getNumOperands() > 1) {
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage,
+ RuntimeMD::OpenCL_C, 1);
+ uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
+ ->getZExtValue();
+ uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
+ ->getZExtValue();
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion,
+ Major * 100 + Minor * 10, 2);
+ }
+ }
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 3b415774df49..b955e231699a 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,7 +14,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
@@ -30,15 +32,28 @@ using namespace llvm;
namespace {
class AMDGPUCodeGenPrepare : public FunctionPass,
- public InstVisitor<AMDGPUCodeGenPrepare> {
+ public InstVisitor<AMDGPUCodeGenPrepare, bool> {
+ const GCNTargetMachine *TM;
+ const SISubtarget *ST;
DivergenceAnalysis *DA;
- const TargetMachine *TM;
+ Module *Mod;
+ bool HasUnsafeFPMath;
public:
static char ID;
AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
FunctionPass(ID),
- TM(TM) { }
+ TM(static_cast<const GCNTargetMachine *>(TM)),
+ ST(nullptr),
+ DA(nullptr),
+ Mod(nullptr),
+ HasUnsafeFPMath(false) { }
+
+ bool visitFDiv(BinaryOperator &I);
+
+ bool visitInstruction(Instruction &I) {
+ return false;
+ }
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
@@ -55,7 +70,92 @@ public:
} // End anonymous namespace
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+ const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
+ if (!CNum)
+ return false;
+
+ // Reciprocal f32 is handled separately without denormals.
+ return UnsafeDiv || CNum->isExactlyValue(+1.0);
+}
+
+// Insert an intrinsic for fast fdiv for safe math situations where we can
+// reduce precision. Leave fdiv for situations where the generic node is
+// expected to be optimized.
+bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
+ Type *Ty = FDiv.getType();
+
+ // TODO: Handle half
+ if (!Ty->getScalarType()->isFloatTy())
+ return false;
+
+ MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+ if (!FPMath)
+ return false;
+
+ const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
+ float ULP = FPOp->getFPAccuracy();
+ if (ULP < 2.5f)
+ return false;
+
+ FastMathFlags FMF = FPOp->getFastMathFlags();
+ bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
+ FMF.allowReciprocal();
+ if (ST->hasFP32Denormals() && !UnsafeDiv)
+ return false;
+
+ IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+ Builder.setFastMathFlags(FMF);
+ Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
+
+ const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
+ Function *Decl
+ = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
+
+ Value *Num = FDiv.getOperand(0);
+ Value *Den = FDiv.getOperand(1);
+
+ Value *NewFDiv = nullptr;
+
+ if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ NewFDiv = UndefValue::get(VT);
+
+ // FIXME: Doesn't do the right thing for cases where the vector is partially
+ // constant. This works when the scalarizer pass is run first.
+ for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
+ Value *NumEltI = Builder.CreateExtractElement(Num, I);
+ Value *DenEltI = Builder.CreateExtractElement(Den, I);
+ Value *NewElt;
+
+ if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+ NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
+ } else {
+ NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
+ }
+
+ NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
+ }
+ } else {
+ if (!shouldKeepFDivF32(Num, UnsafeDiv))
+ NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+ }
+
+ if (NewFDiv) {
+ FDiv.replaceAllUsesWith(NewFDiv);
+ NewFDiv->takeName(&FDiv);
+ FDiv.eraseFromParent();
+ }
+
+ return true;
+}
+
+static bool hasUnsafeFPMath(const Function &F) {
+ Attribute Attr = F.getFnAttribute("unsafe-fp-math");
+ return Attr.getValueAsString() == "true";
+}
+
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
+ Mod = &M;
return false;
}
@@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
if (!TM || skipFunction(F))
return false;
+ ST = &TM->getSubtarget<SISubtarget>(F);
DA = &getAnalysis<DivergenceAnalysis>();
- visit(F);
+ HasUnsafeFPMath = hasUnsafeFPMath(F);
- return true;
+ bool MadeChange = false;
+
+ for (BasicBlock &BB : F) {
+ BasicBlock::iterator Next;
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+ Next = std::next(I);
+ MadeChange |= visit(*I);
+ }
+ }
+
+ return MadeChange;
}
INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
@@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
char AMDGPUCodeGenPrepare::ID = 0;
-FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
return new AMDGPUCodeGenPrepare(TM);
}
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 6761b4b5df95..3944fdbd31e3 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -420,9 +420,10 @@ int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
-int FP32_NEG_ONE = 0xbf800000;
int FP32_ONE = 0x3f800000;
+int FP32_NEG_ONE = 0xbf800000;
int FP64_ONE = 0x3ff0000000000000;
+int FP64_NEG_ONE = 0xbff0000000000000;
}
def CONST : Constants;
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 791872a9db40..8e3471bd2083 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -29,16 +29,39 @@ static const char *const IntrinsicNameTable[] = {
#undef GET_INTRINSIC_NAME_TABLE
};
-std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
- unsigned numTys) const {
- if (IntrID < Intrinsic::num_intrinsics) {
- return nullptr;
- }
+namespace {
+#define GET_INTRINSIC_ATTRIBUTES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ATTRIBUTES
+}
+
+StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
+ ArrayRef<Type *> Tys) const {
+ if (IntrID < Intrinsic::num_intrinsics)
+ return StringRef();
+
assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
"Invalid intrinsic ID");
- std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]);
- return Result;
+ return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
+}
+
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+ unsigned NumTys) const {
+ return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
+}
+
+FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
+ ArrayRef<Type*> Tys) const {
+ // FIXME: Re-use Intrinsic::getType machinery
+ switch (ID) {
+ case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+ Type *F32Ty = Type::getFloatTy(Context);
+ return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
+ }
+ default:
+ llvm_unreachable("unhandled intrinsic");
+ }
}
unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
@@ -69,7 +92,19 @@ bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
}
Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+ ArrayRef<Type *> Tys) const {
+ FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
+ Function *F
+ = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
+
+ AttributeSet AS = getAttributes(M->getContext(),
+ static_cast<AMDGPUIntrinsic::ID>(IntrID));
+ F->setAttributes(AS);
+ return F;
+}
+
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
Type **Tys,
- unsigned numTys) const {
- llvm_unreachable("Not implemented");
+ unsigned NumTys) const {
+ return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
}
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index f4173929259c..6cb8b9644642 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -34,13 +34,23 @@ enum ID {
class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
public:
AMDGPUIntrinsicInfo();
+
+ StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
+
std::string getName(unsigned IntrId, Type **Tys = nullptr,
- unsigned numTys = 0) const override;
+ unsigned NumTys = 0) const override;
+
unsigned lookupName(const char *Name, unsigned Len) const override;
bool isOverloaded(unsigned IID) const override;
Function *getDeclaration(Module *M, unsigned ID,
Type **Tys = nullptr,
- unsigned numTys = 0) const override;
+ unsigned NumTys = 0) const override;
+
+ Function *getDeclaration(Module *M, unsigned ID,
+ ArrayRef<Type *> = None) const;
+
+ FunctionType *getType(LLVMContext &Context, unsigned ID,
+ ArrayRef<Type*> Tys = None) const;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 775463809634..0bad63fa77ad 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -348,9 +348,6 @@ static VectorType *arrayTypeToVecType(Type *ArrayTy) {
static Value *
calculateVectorIndex(Value *Ptr,
const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
- if (isa<AllocaInst>(Ptr))
- return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
-
GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
auto I = GEPIdx.find(GEP);
@@ -360,11 +357,11 @@ calculateVectorIndex(Value *Ptr,
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
// FIXME we only support simple cases
if (GEP->getNumOperands() != 3)
- return NULL;
+ return nullptr;
ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
if (!I0 || !I0->isZero())
- return NULL;
+ return nullptr;
return GEP->getOperand(2);
}
@@ -398,7 +395,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
// are just being conservative for now.
if (!AllocaTy ||
AllocaTy->getElementType()->isVectorTy() ||
- AllocaTy->getNumElements() > 4) {
+ AllocaTy->getNumElements() > 4 ||
+ AllocaTy->getNumElements() < 2) {
DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
@@ -443,9 +441,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = Inst->getOperand(0);
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
- Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+
+ Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(BitCast);
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
Inst->replaceAllUsesWith(ExtractElement);
@@ -453,9 +453,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
break;
}
case Instruction::Store: {
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+
Value *Ptr = Inst->getOperand(1);
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
- Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+ Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(BitCast);
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
Inst->getOperand(0),
@@ -469,7 +471,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
break;
default:
- Inst->dump();
llvm_unreachable("Inconsistency in instructions promotable to vector");
}
}
@@ -477,11 +478,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
}
static bool isCallPromotable(CallInst *CI) {
- // TODO: We might be able to handle some cases where the callee is a
- // constantexpr bitcast of a function.
- if (!CI->getCalledFunction())
- return false;
-
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
if (!II)
return false;
@@ -773,28 +769,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
continue;
}
- IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
- if (!Intr) {
- // FIXME: What is this for? It doesn't make sense to promote arbitrary
- // function calls. If the call is to a defined function that can also be
- // promoted, we should be able to do this once that function is also
- // rewritten.
-
- std::vector<Type*> ArgTypes;
- for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
- ArgIdx != ArgEnd; ++ArgIdx) {
- ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
- }
- Function *F = Call->getCalledFunction();
- FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
- F->isVarArg());
- Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
- NewType, F->getAttributes());
- Function *NewF = cast<Function>(C);
- Call->setCalledFunction(NewF);
- continue;
- }
-
+ IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
Builder.SetInsertPoint(Intr);
switch (Intr->getIntrinsicID()) {
case Intrinsic::lifetime_start:
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3e53f52c689f..b2d4e1144c75 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -309,6 +309,7 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override;
+ void addIRPasses() override;
bool addPreISel() override;
void addMachineSSAOptimization() override;
bool addInstSelector() override;
@@ -499,6 +500,13 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&DeadMachineInstructionElimID);
}
+void GCNPassConfig::addIRPasses() {
+ // TODO: May want to move later or split into an early and late one.
+ addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
+
+ AMDGPUPassConfig::addIRPasses();
+}
+
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(createSILowerI1CopiesPass());
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 8f78edd76a51..8ccd176930a6 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -122,6 +122,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::i32, Expand);
setOperationAction(ISD::SETCC, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
@@ -832,13 +833,18 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
return;
case ISD::FP_TO_UINT:
if (N->getValueType(0) == MVT::i1) {
- Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
+ Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
return;
}
// Fall-through. Since we don't care about out of bounds values
// we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
// considers some extra cases which are not necessary here.
case ISD::FP_TO_SINT: {
+ if (N->getValueType(0) == MVT::i1) {
+ Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
+ return;
+ }
+
SDValue Result;
if (expandFP_TO_SINT(N, Result, DAG))
Results.push_back(Result);
@@ -1052,15 +1058,24 @@ SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
}
-SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
+SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(
+ ISD::SETCC,
+ DL,
+ MVT::i1,
+ Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
+ DAG.getCondCode(ISD::SETEQ));
+}
+
+SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(
ISD::SETCC,
DL,
MVT::i1,
- Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
- DAG.getCondCode(ISD::SETNE)
- );
+ Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
+ DAG.getCondCode(ISD::SETEQ));
}
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 2fb6ee25caa9..9700ce14c6f3 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -72,7 +72,8 @@ private:
SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 54efdc0a0466..f4b04e3631a5 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -41,7 +41,8 @@ enum {
WQM = 1 << 22,
VGPRSpill = 1 << 23,
VOPAsmPrefer32Bit = 1 << 24,
- Gather4 = 1 << 25
+ Gather4 = 1 << 25,
+ DisableWQM = 1 << 26
};
}
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 51241cf0a432..80d44351267d 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1134,9 +1134,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineFunction *MF = BB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
DebugLoc DL = MI.getDebugLoc();
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32))
- .addOperand(MI.getOperand(0))
- .addImm(MFI->LDSSize);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
+ .addOperand(MI.getOperand(0))
+ .addImm(MFI->LDSSize);
MI.eraseFromParent();
return BB;
}
@@ -1792,6 +1792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
Op->getVTList(), Ops, VT, MMO);
}
+ case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+ return lowerFDIV_FAST(Op, DAG);
+ }
case AMDGPUIntrinsic::SI_vs_load_input:
return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
Op.getOperand(1),
@@ -2098,7 +2101,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
-SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
+SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -2139,47 +2143,48 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
- if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
- return FastLowered;
-
+// Faster 2.5 ULP division that does not support denormals.
+SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
- // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag
- if (EnableAMDGPUFastFDIV) {
- // This does not support denormals.
- SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+ SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
- const APFloat K0Val(BitsToFloat(0x6f800000));
- const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+ const APFloat K0Val(BitsToFloat(0x6f800000));
+ const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
- const APFloat K1Val(BitsToFloat(0x2f800000));
- const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+ const APFloat K1Val(BitsToFloat(0x2f800000));
+ const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
- const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+ const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
- EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
- SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+ SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
- SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
- // TODO: Should this propagate fast-math-flags?
+ // TODO: Should this propagate fast-math-flags?
+ r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
- r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+ // rcp does not support denormals.
+ SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
- // rcp does not support denormals.
- SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+ return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+}
- return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
- }
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+ if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+ return FastLowered;
+
+ SDLoc SL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
- // Generates more precise fpdiv32.
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
@@ -2209,7 +2214,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
if (DAG.getTarget().Options.UnsafeFPMath)
- return LowerFastFDIV(Op, DAG);
+ return lowerFastUnsafeFDIV(Op, DAG);
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 8e055eea58c2..1d349faa592c 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -36,7 +36,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 2f63d4ed13b3..6163f0547bd5 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -41,6 +41,8 @@ class InstSI <dag outs, dag ins, string asm = "",
field bits<1> DS = 0;
field bits<1> MIMG = 0;
field bits<1> FLAT = 0;
+
+ // Whether WQM _must_ be enabled for this instruction.
field bits<1> WQM = 0;
field bits<1> VGPRSpill = 0;
@@ -50,6 +52,9 @@ class InstSI <dag outs, dag ins, string asm = "",
field bits<1> Gather4 = 0;
+ // Whether WQM _must_ be disabled for this instruction.
+ field bits<1> DisableWQM = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
let TSFlags{1} = EXP_CNT;
@@ -81,6 +86,7 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{23} = VGPRSpill;
let TSFlags{24} = VOPAsmPrefer32Bit;
let TSFlags{25} = Gather4;
+ let TSFlags{26} = DisableWQM;
let SchedRW = [Write32Bit];
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index d171e21c8a4f..5cc6a4e0e83e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -738,7 +738,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
MachineBasicBlock::iterator Insert = Entry.front();
DebugLoc DL = Insert->getDebugLoc();
- TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
+ TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
+ *MF);
if (TIDReg == AMDGPU::NoRegister)
return TIDReg;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 227b817227c2..fef8904a6c87 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -340,6 +340,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::WQM;
}
+ static bool isDisableWQM(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
+ }
+
+ bool isDisableWQM(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
+ }
+
static bool isVGPRSpill(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 253cc32b27e4..00f53e846db4 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2949,6 +2949,10 @@ multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm,
def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
MUBUFAddr64Table <0>;
+ let DisableWQM = 1 in {
+ def "_exact" : MUBUF_Pseudo <opName, outs, ins, []>;
+ }
+
let addr64 = 0, isCodeGenOnly = 0 in {
def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
}
@@ -3019,7 +3023,8 @@ multiclass MUBUFAtomicOther_m <mubuf op, string opName, dag outs, dag ins,
multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
ValueType vt, SDPatternOperator atomic> {
- let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in {
+ let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1,
+ DisableWQM = 1 in {
// No return variants
let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in {
@@ -3423,6 +3428,7 @@ class MIMG_Store_Helper <bits<7> op, string asm,
let mayStore = 1;
let hasSideEffects = 1;
let hasPostISelHook = 0;
+ let DisableWQM = 1;
}
multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
@@ -3454,6 +3460,7 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
let mayStore = 1;
let hasSideEffects = 1;
let hasPostISelHook = 0;
+ let DisableWQM = 1;
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
}
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 6427db87cd6f..18b7d5d62efe 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -2200,7 +2200,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, 0,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+ (!cast<MUBUF>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
@@ -2208,7 +2208,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
+ (!cast<MUBUF>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (as_i1imm $glc),
(as_i1imm $slc), 0)
>;
@@ -2217,7 +2217,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, 0,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
+ (!cast<MUBUF>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (as_i1imm $glc),
(as_i1imm $slc), 0)
>;
@@ -2226,7 +2226,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _BOTHEN)
+ (!cast<MUBUF>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
@@ -3391,6 +3391,16 @@ def : Pat <
(V_CNDMASK_B32_e64 0, -1, $src), sub1)
>;
+class FPToI1Pat<Instruction Inst, int KOne, ValueType vt, SDPatternOperator fp_to_int> : Pat <
+ (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
+ (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
+>;
+
+def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, f32, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, f32, fp_to_sint>;
+def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, f64, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, f64, fp_to_sint>;
+
// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
// comparisions still write to a pair of SGPRs, so treat these as
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
index a9b7c39096e7..9d06ccfc6c7f 100644
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/lib/Target/AMDGPU/SIIntrinsics.td
@@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// SI Intrinsic Definitions
+// Backend internal SI Intrinsic Definitions. User code should not
+// directly use these.
//
//===----------------------------------------------------------------------===//
@@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in {
} // End TargetPrefix = "SI", isTarget = 1
let TargetPrefix = "amdgcn", isTarget = 1 in {
+ // Emit 2.5 ulp, no denormal division. Should only be inserted by
+ // pass based on !fpmath metadata.
+ def int_amdgcn_fdiv_fast : Intrinsic<
+ [llvm_float_ty], [llvm_float_ty], [IntrNoMem]
+ >;
+
/* Control flow Intrinsics */
def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 4d12a1ef9a93..848be32cd515 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -203,7 +203,8 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
Spill.Lane = Lane;
if (!LaneVGPRs.count(LaneVGPRIdx)) {
- unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
+ unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass,
+ *MF);
if (LaneVGPR == AMDGPU::NoRegister)
// We have no VGPRs left for spilling SGPRs.
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 0dd88ee45c58..347c33fb3760 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -957,10 +957,13 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
/// \brief Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
// AMDGPU::NoRegister.
-unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC) const {
+unsigned
+SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC,
+ const MachineFunction &MF) const {
+
for (unsigned Reg : *RC)
- if (!MRI.isPhysRegUsed(Reg))
+ if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
return Reg;
return AMDGPU::NoRegister;
}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 6e97b1b910a9..d8b2d9f4e975 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -185,7 +185,8 @@ public:
unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const;
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC) const;
+ const TargetRegisterClass *RC,
+ const MachineFunction &MF) const;
unsigned getSGPR32PressureSet() const { return SGPR32SetID; };
unsigned getVGPR32PressureSet() const { return VGPR32SetID; };
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index c1a237ea5f51..b200c153df0b 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -94,12 +94,15 @@ private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<const MachineInstr *, 2> ExecExports;
SmallVector<MachineInstr *, 1> LiveMaskQueries;
+ void markInstruction(MachineInstr &MI, char Flag,
+ std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -126,6 +129,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -135,8 +139,11 @@ public:
char SIWholeQuadMode::ID = 0;
-INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
- "SI Whole Quad Mode", false, false)
+INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+ false)
char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
@@ -144,6 +151,23 @@ FunctionPass *llvm::createSIWholeQuadModePass() {
return new SIWholeQuadMode;
}
+void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
+ std::vector<WorkItem> &Worklist) {
+ InstrInfo &II = Instructions[&MI];
+
+ assert(Flag == StateWQM || Flag == StateExact);
+
+ // Ignore if the instruction is already marked. The typical case is that we
+ // mark an instruction WQM multiple times, but for atomics it can happen that
+ // Flag is StateWQM, but Needs is already set to StateExact. In this case,
+ // letting the atomic run in StateExact is correct as per the relevant specs.
+ if (II.Needs)
+ return;
+
+ II.Needs = Flag;
+ Worklist.push_back(&MI);
+}
+
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
@@ -161,7 +185,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
Flags = StateWQM;
- } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
+ } else if (TII->isDisableWQM(MI)) {
Flags = StateExact;
} else {
// Handle export instructions with the exec mask valid flag set
@@ -192,8 +216,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
continue;
}
- Instructions[&MI].Needs = Flags;
- Worklist.push_back(&MI);
+ markInstruction(MI, Flags, Worklist);
GlobalFlags |= Flags;
}
@@ -214,9 +237,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
BlockInfo &BI = Blocks[MBB];
- // Control flow-type instructions that are followed by WQM computations
- // must themselves be in WQM.
- if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
+ // Control flow-type instructions and stores to temporary memory that are
+ // followed by WQM computations must themselves be in WQM.
+ if ((II.OutNeeds & StateWQM) && !II.Needs &&
+ (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
Instructions[&MI].Needs = StateWQM;
II.Needs = StateWQM;
}
@@ -249,32 +273,35 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
if (!Use.isReg() || !Use.isUse())
continue;
- // At this point, physical registers appear as inputs or outputs
- // and following them makes no sense (and would in fact be incorrect
- // when the same VGPR is used as both an output and an input that leads
- // to a NeedsWQM instruction).
- //
- // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
- // have to trace this, in practice it happens for 64-bit computations like
- // pointers where both dwords are followed already anyway.
- if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
- continue;
-
- for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
- InstrInfo &DefII = Instructions[&DefMI];
+ unsigned Reg = Use.getReg();
- // Obviously skip if DefMI is already flagged as NeedWQM.
- //
- // The instruction might also be flagged as NeedExact. This happens when
- // the result of an atomic is used in a WQM computation. In this case,
- // the atomic must not run for helper pixels and the WQM result is
- // undefined.
- if (DefII.Needs != 0)
+ // Handle physical registers that we need to track; this is mostly relevant
+ // for VCC, which can appear as the (implicit) input of a uniform branch,
+ // e.g. when a loop counter is stored in a VGPR.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Reg == AMDGPU::EXEC)
continue;
- DefII.Needs = StateWQM;
- Worklist.push_back(&DefMI);
+ for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+ LiveRange &LR = LIS->getRegUnit(*RegUnit);
+ const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+ if (!Value)
+ continue;
+
+ // Since we're in machine SSA, we do not need to track physical
+ // registers across basic blocks.
+ if (Value->isPHIDef())
+ continue;
+
+ markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+ Worklist);
+ }
+
+ continue;
}
+
+ for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+ markInstruction(DefMI, StateWQM, Worklist);
}
}
@@ -468,6 +495,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
+ LIS = &getAnalysis<LiveIntervals>();
char GlobalFlags = analyzeFunction(MF);
if (!(GlobalFlags & StateWQM)) {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index d6e7caf98a80..3cfcb1e09f0b 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -3857,7 +3857,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// Try to convert two saturating conditional selects into a single SSAT
SDValue SatValue;
uint64_t SatConstant;
- if (isSaturatingConditional(Op, SatValue, SatConstant))
+ if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
+ isSaturatingConditional(Op, SatValue, SatConstant))
return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 060376b0a273..c9735f3ec277 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -3650,7 +3650,8 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
def SSAT : AI<(outs GPRnopc:$Rd),
(ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
- SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+ SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+ Requires<[IsARM,HasV6]>{
bits<4> Rd;
bits<5> sat_imm;
bits<4> Rn;
@@ -3666,7 +3667,8 @@ def SSAT : AI<(outs GPRnopc:$Rd),
def SSAT16 : AI<(outs GPRnopc:$Rd),
(ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm,
- NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> {
+ NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
+ Requires<[IsARM,HasV6]>{
bits<4> Rd;
bits<4> sat_imm;
bits<4> Rn;
@@ -3679,7 +3681,8 @@ def SSAT16 : AI<(outs GPRnopc:$Rd),
def USAT : AI<(outs GPRnopc:$Rd),
(ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
- SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+ SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+ Requires<[IsARM,HasV6]> {
bits<4> Rd;
bits<5> sat_imm;
bits<4> Rn;
@@ -3695,7 +3698,8 @@ def USAT : AI<(outs GPRnopc:$Rd),
def USAT16 : AI<(outs GPRnopc:$Rd),
(ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm,
- NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> {
+ NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>,
+ Requires<[IsARM,HasV6]>{
bits<4> Rd;
bits<4> sat_imm;
bits<4> Rn;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 55e5308be40e..fe699b284882 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -2240,7 +2240,8 @@ class T2SatI<dag oops, dag iops, InstrItinClass itin,
def t2SSAT: T2SatI<
(outs rGPR:$Rd),
(ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
- NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+ NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+ Requires<[IsThumb2]> {
let Inst{31-27} = 0b11110;
let Inst{25-22} = 0b1100;
let Inst{20} = 0;
@@ -2251,7 +2252,7 @@ def t2SSAT: T2SatI<
def t2SSAT16: T2SatI<
(outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
"ssat16", "\t$Rd, $sat_imm, $Rn", []>,
- Requires<[IsThumb2, HasDSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11110;
let Inst{25-22} = 0b1100;
let Inst{20} = 0;
@@ -2265,7 +2266,8 @@ def t2SSAT16: T2SatI<
def t2USAT: T2SatI<
(outs rGPR:$Rd),
(ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
- NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+ NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+ Requires<[IsThumb2]> {
let Inst{31-27} = 0b11110;
let Inst{25-22} = 0b1110;
let Inst{20} = 0;
@@ -2275,7 +2277,7 @@ def t2USAT: T2SatI<
def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
NoItinerary,
"usat16", "\t$Rd, $sat_imm, $Rn", []>,
- Requires<[IsThumb2, HasDSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-22} = 0b1111001110;
let Inst{20} = 0;
let Inst{15} = 0;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index cdad7ce1b73a..20c5f3691d23 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -518,6 +518,10 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
return true;
return false;
+ case ELF::R_MIPS_GOT_PAGE:
+ case ELF::R_MICROMIPS_GOT_PAGE:
+ case ELF::R_MIPS_GOT_OFST:
+ case ELF::R_MICROMIPS_GOT_OFST:
case ELF::R_MIPS_16:
case ELF::R_MIPS_32:
case ELF::R_MIPS_GPREL32:
@@ -539,8 +543,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
case ELF::R_MIPS_SHIFT5:
case ELF::R_MIPS_SHIFT6:
case ELF::R_MIPS_GOT_DISP:
- case ELF::R_MIPS_GOT_PAGE:
- case ELF::R_MIPS_GOT_OFST:
case ELF::R_MIPS_GOT_HI16:
case ELF::R_MIPS_GOT_LO16:
case ELF::R_MIPS_INSERT_A:
@@ -589,8 +591,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
case ELF::R_MICROMIPS_PC16_S1:
case ELF::R_MICROMIPS_CALL16:
case ELF::R_MICROMIPS_GOT_DISP:
- case ELF::R_MICROMIPS_GOT_PAGE:
- case ELF::R_MICROMIPS_GOT_OFST:
case ELF::R_MICROMIPS_GOT_HI16:
case ELF::R_MICROMIPS_GOT_LO16:
case ELF::R_MICROMIPS_SUB:
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 1622b2212665..1ce8f07092b1 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -28,12 +28,19 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
PointerSize = CalleeSaveStackSlotSize = 8;
}
+ // FIXME: This condition isn't quite right but it's the best we can do until
+ // this object can identify the ABI. It will misbehave when using O32
+ // on a mips64*-* triple.
+ if ((TheTriple.getArch() == Triple::mipsel) ||
+ (TheTriple.getArch() == Triple::mips)) {
+ PrivateGlobalPrefix = "$";
+ PrivateLabelPrefix = "$";
+ }
+
AlignmentIsInBytes = false;
Data16bitsDirective = "\t.2byte\t";
Data32bitsDirective = "\t.4byte\t";
Data64bitsDirective = "\t.8byte\t";
- PrivateGlobalPrefix = "$";
- PrivateLabelPrefix = "$";
CommentString = "#";
ZeroDirective = "\t.space\t";
GPRel32Directive = "\t.gpword\t";
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index c248c3a50ac8..80641ed9bd31 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -57,7 +57,10 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
else
Ret += "E";
- Ret += "-m:m";
+ if (ABI.IsO32())
+ Ret += "-m:m";
+ else
+ Ret += "-m:e";
// Pointers are 32 bit on some ABIs.
if (!ABI.IsN64())
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e54711195900..2c548384f1cb 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1187,6 +1187,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
@@ -13373,6 +13381,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
@@ -13380,6 +13389,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
DAG.getUNDEF(SrcVT)));
}
if (SrcVT.getVectorElementType() == MVT::i1) {
+ if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
@@ -13694,6 +13706,15 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
MVT SVT = N0.getSimpleValueType();
SDLoc dl(Op);
+ if (SVT.getVectorElementType() == MVT::i1) {
+ if (SVT == MVT::v2i1)
+ return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
+ MVT IntegerVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
+ return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
+ }
+
switch (SVT.SimpleTy) {
default:
llvm_unreachable("Custom UINT_TO_FP is not supported!");
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 1672b3855b79..5f0aab9ddc68 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2661,7 +2661,8 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned Opc, bool AllowSP, unsigned &NewSrc,
bool &isKill, bool &isUndef,
- MachineOperand &ImplicitOp) const {
+ MachineOperand &ImplicitOp,
+ LiveVariables *LV) const {
MachineFunction &MF = *MI.getParent()->getParent();
const TargetRegisterClass *RC;
if (AllowSP) {
@@ -2715,13 +2716,17 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
// Virtual register of the wrong class, we have to create a temporary 64-bit
// vreg to feed into the LEA.
NewSrc = MF.getRegInfo().createVirtualRegister(RC);
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ get(TargetOpcode::COPY))
.addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
.addOperand(Src);
// Which is obviously going to be dead after we're done with it.
isKill = true;
isUndef = false;
+
+ if (LV)
+ LV->replaceKillInstruction(SrcReg, MI, *Copy);
}
// We've set all the parameters without issue.
@@ -2900,7 +2905,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
- SrcReg, isKill, isUndef, ImplicitOp))
+ SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;
MachineInstrBuilder MIB =
@@ -2943,7 +2948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
- SrcReg, isKill, isUndef, ImplicitOp))
+ SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;
MachineInstrBuilder MIB =
@@ -2977,7 +2982,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
- SrcReg, isKill, isUndef, ImplicitOp))
+ SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
@@ -3016,7 +3021,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
- SrcReg, isKill, isUndef, ImplicitOp))
+ SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;
const MachineOperand &Src2 = MI.getOperand(2);
@@ -3024,7 +3029,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg2;
MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
- SrcReg2, isKill2, isUndef2, ImplicitOp2))
+ SrcReg2, isKill2, isUndef2, ImplicitOp2, LV))
return nullptr;
MachineInstrBuilder MIB =
@@ -3087,7 +3092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
- SrcReg, isKill, isUndef, ImplicitOp))
+ SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 858f35d1cbf0..a8a9f629fc1d 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -230,7 +230,7 @@ public:
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
bool &isKill, bool &isUndef,
- MachineOperand &ImplicitOp) const;
+ MachineOperand &ImplicitOp, LiveVariables *LV) const;
/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 9a98f5cac2ee..f91764a67d1c 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1820,7 +1820,7 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
Sched<[WriteCvtF2F]>;
-def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
+def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
@@ -1836,7 +1836,7 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
Sched<[WriteCvtF2F]>;
-def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
+def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
@@ -2009,24 +2009,35 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (loadv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ [(set VR256:$dst,
+ (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+ [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+ (loadv8f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
@@ -2096,10 +2107,14 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;