Diffstat (limited to 'lib/Target/AMDGPU/R600ISelLowering.cpp')
-rw-r--r-- | lib/Target/AMDGPU/R600ISelLowering.cpp | 1006
1 file changed, 560 insertions, 446 deletions
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 124a9c6e0f564..8f78edd76a51e 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -30,18 +30,61 @@ using namespace llvm; -R600TargetLowering::R600TargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) +R600TargetLowering::R600TargetLowering(const TargetMachine &TM, + const R600Subtarget &STI) : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { - addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); + addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); computeRegisterProperties(STI.getRegisterInfo()); + // Legalize loads and stores to the private address space. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address + // spaces, so it is custom lowered to handle those where it isn't. + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); + } + + // Workaround for LegalizeDAG asserting on expansion of i1 vector loads. + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + + + setOperationAction(ISD::STORE, MVT::i8, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setTruncStoreAction(MVT::i32, MVT::i8, Custom); + setTruncStoreAction(MVT::i32, MVT::i16, Custom); + + // Workaround for LegalizeDAG asserting on expansion of i1 vector stores. 
+ setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand); + // Set condition code actions setCondCodeAction(ISD::SETO, MVT::f32, Expand); setCondCodeAction(ISD::SETUO, MVT::f32, Expand); @@ -73,10 +116,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f32, Expand); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -122,37 +161,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); - - // Legalize loads and stores to the private address space. - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - - // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address - // spaces, so it is custom lowered to handle those where it isn't. - for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); - } - - setOperationAction(ISD::STORE, MVT::i8, Custom); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setTruncStoreAction(MVT::i32, MVT::i8, Custom); - setTruncStoreAction(MVT::i32, MVT::i16, Custom); - - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); @@ -165,12 +173,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - setTargetDAGCombine(ISD::FP_ROUND); - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 // to be Legal/Custom in order to avoid library calls. 
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); @@ -188,119 +190,138 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, } setSchedulingPreference(Sched::Source); + + + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); +} + +const R600Subtarget *R600TargetLowering::getSubtarget() const { + return static_cast<const R600Subtarget *>(Subtarget); } static inline bool isEOP(MachineBasicBlock::iterator I) { return std::next(I)->getOpcode() == AMDGPU::RETURN; } -MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { +MachineBasicBlock * +R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const { MachineFunction * MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - MachineBasicBlock::iterator I = *MI; - const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); + MachineBasicBlock::iterator I = MI; + const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: // Replace LDS_*_RET instruction that don't have any uses with the // equivalent LDS_*_NORET instruction. - if (TII->isLDSRetInstr(MI->getOpcode())) { - int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + if (TII->isLDSRetInstr(MI.getOpcode())) { + int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); assert(DstIdx != -1); MachineInstrBuilder NewMI; // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add // LDS_1A2D support and remove this special case. - if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || - MI->getOpcode() == AMDGPU::LDS_CMPST_RET) + if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) || + MI.getOpcode() == AMDGPU::LDS_CMPST_RET) return BB; NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); - for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { - NewMI.addOperand(MI->getOperand(i)); + TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode()))); + for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { + NewMI.addOperand(MI.getOperand(i)); } } else { return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } break; case AMDGPU::CLAMP_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP); break; } case AMDGPU::FABS_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_ABS); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + TII->addFlag(*NewMI, 0, MO_FLAG_ABS); break; } case AMDGPU::FNEG_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_NEG); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + TII->addFlag(*NewMI, 0, 
MO_FLAG_NEG); break; } case AMDGPU::MASK_WRITE: { - unsigned maskedRegister = MI->getOperand(0).getReg(); + unsigned maskedRegister = MI.getOperand(0).getReg(); assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); - TII->addFlag(defInstr, 0, MO_FLAG_MASK); + TII->addFlag(*defInstr, 0, MO_FLAG_MASK); break; } case AMDGPU::MOV_IMM_F32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getFPImm()->getValueAPF() - .bitcastToAPInt().getZExtValue()); + TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1) + .getFPImm() + ->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); break; case AMDGPU::MOV_IMM_I32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getImm()); + TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), + MI.getOperand(1).getImm()); break; + case AMDGPU::MOV_IMM_GLOBAL_ADDR: { + //TODO: Perhaps combine this instruction with the next if possible + auto MIB = TII->buildDefaultInstruction( + *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X); + int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal); + //TODO: Ugh this is rather ugly + MIB->getOperand(Idx) = MI.getOperand(1); + break; + } case AMDGPU::CONST_COPY: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, - MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); - TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, - MI->getOperand(1).getImm()); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); + TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel, + MI.getOperand(1).getImm()); break; } case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(isEOP(I)); // Set End of program bit + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addImm(isEOP(I)); // Set End of program bit break; } case AMDGPU::RAT_STORE_TYPED_eg: { - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(2)) - .addImm(isEOP(I)); // Set End of program bit + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addImm(isEOP(I)); // Set End of program bit break; } case AMDGPU::TXD: { unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); + MachineOperand &RID = MI.getOperand(4); + MachineOperand &SID = MI.getOperand(5); + unsigned TextureId = MI.getOperand(6).getImm(); unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; @@ -333,75 +354,77 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( CTZ = 0; break; } - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - 
.addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) - .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), + T0) + .addOperand(MI.getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), + T1) + .addOperand(MI.getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); break; } case AMDGPU::TXD_SHADOW: { unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); + MachineOperand &RID = MI.getOperand(4); + MachineOperand &SID = MI.getOperand(5); + unsigned TextureId = MI.getOperand(6).getImm(); unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; @@ -435,99 +458,101 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) - .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), + T0) 
+ .addOperand(MI.getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), + T1) + .addOperand(MI.getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); break; } case AMDGPU::BRANCH: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) - .addOperand(MI->getOperand(0)); - break; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI.getOperand(0)); + break; case AMDGPU::BRANCH_COND_f32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO) - .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI.getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO) + .addImm(0); // Flags + TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addOperand(MI.getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; } case AMDGPU::BRANCH_COND_i32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI.getOperand(1)) .addImm(OPCODE_IS_NOT_ZERO_INT) .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addOperand(MI.getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; } @@ -535,7 +560,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( case AMDGPU::R600_ExportSwz: { // Instruction is left unmodified if its not the last one of its type bool isLastInstructionOfItsType = true; - unsigned InstExportType = MI->getOperand(1).getImm(); + unsigned InstExportType = MI.getOperand(1).getImm(); for (MachineBasicBlock::iterator NextExportInst = std::next(I), EndBlock 
= BB->end(); NextExportInst != EndBlock; NextExportInst = std::next(NextExportInst)) { @@ -552,17 +577,17 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; - unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(2)) - .addOperand(MI->getOperand(3)) - .addOperand(MI->getOperand(4)) - .addOperand(MI->getOperand(5)) - .addOperand(MI->getOperand(6)) - .addImm(CfInst) - .addImm(EOP); + unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addOperand(MI.getOperand(3)) + .addOperand(MI.getOperand(4)) + .addOperand(MI.getOperand(5)) + .addOperand(MI.getOperand(6)) + .addImm(CfInst) + .addImm(EOP); break; } case AMDGPU::RETURN: { @@ -576,7 +601,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( } } - MI->eraseFromParent(); + MI.eraseFromParent(); return BB; } @@ -610,18 +635,13 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); + case ISD::FrameIndex: return lowerFrameIndex(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { - case AMDGPUIntrinsic::AMDGPU_store_output: { - int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MFI->LiveOuts.push_back(Reg); - return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); - } - case AMDGPUIntrinsic::R600_store_swizzle: { + case AMDGPUIntrinsic::r600_store_swizzle: { SDLoc DL(Op); const SDValue Args[8] = { Chain, @@ -649,114 +669,48 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const SDLoc DL(Op); switch(IntrinsicID) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case AMDGPUIntrinsic::R600_load_input: { - int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(Reg); - return DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), Reg, VT); - } - - case AMDGPUIntrinsic::R600_interp_input: { - int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); - MachineSDNode *interp; - if (ijb < 0) { - const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); - interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, - MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); - return DAG.getTargetExtractSubreg( - TII->getRegisterInfo().getSubRegFromChannel(slot % 4), - DL, MVT::f32, SDValue(interp, 0)); - } - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); - unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); - 
MRI.addLiveIn(RegisterI); - MRI.addLiveIn(RegisterJ); - SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); - SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); - - if (slot % 4 < 2) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - return SDValue(interp, slot % 2); - } - case AMDGPUIntrinsic::R600_interp_xy: - case AMDGPUIntrinsic::R600_interp_zw: { - int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - MachineSDNode *interp; - SDValue RegisterINode = Op.getOperand(2); - SDValue RegisterJNode = Op.getOperand(3); - - if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, - SDValue(interp, 0), SDValue(interp, 1)); - } - case AMDGPUIntrinsic::R600_tex: - case AMDGPUIntrinsic::R600_texc: - case AMDGPUIntrinsic::R600_txl: - case AMDGPUIntrinsic::R600_txlc: - case AMDGPUIntrinsic::R600_txb: - case AMDGPUIntrinsic::R600_txbc: - case AMDGPUIntrinsic::R600_txf: - case AMDGPUIntrinsic::R600_txq: - case AMDGPUIntrinsic::R600_ddx: - case AMDGPUIntrinsic::R600_ddy: - case AMDGPUIntrinsic::R600_ldptr: { + case AMDGPUIntrinsic::r600_tex: + case AMDGPUIntrinsic::r600_texc: + case AMDGPUIntrinsic::r600_txl: + case AMDGPUIntrinsic::r600_txlc: + case AMDGPUIntrinsic::r600_txb: + case AMDGPUIntrinsic::r600_txbc: + case AMDGPUIntrinsic::r600_txf: + case AMDGPUIntrinsic::r600_txq: + case AMDGPUIntrinsic::r600_ddx: + case AMDGPUIntrinsic::r600_ddy: { unsigned TextureOp; switch (IntrinsicID) { - case AMDGPUIntrinsic::R600_tex: + case AMDGPUIntrinsic::r600_tex: TextureOp = 0; break; - case AMDGPUIntrinsic::R600_texc: + case AMDGPUIntrinsic::r600_texc: TextureOp = 1; break; - case AMDGPUIntrinsic::R600_txl: + case AMDGPUIntrinsic::r600_txl: TextureOp = 2; break; - case AMDGPUIntrinsic::R600_txlc: + case AMDGPUIntrinsic::r600_txlc: TextureOp = 3; break; - case AMDGPUIntrinsic::R600_txb: + case AMDGPUIntrinsic::r600_txb: TextureOp = 4; break; - case AMDGPUIntrinsic::R600_txbc: + case AMDGPUIntrinsic::r600_txbc: TextureOp = 5; break; - case AMDGPUIntrinsic::R600_txf: + case AMDGPUIntrinsic::r600_txf: TextureOp = 6; break; - case AMDGPUIntrinsic::R600_txq: + case AMDGPUIntrinsic::r600_txq: TextureOp = 7; break; - case AMDGPUIntrinsic::R600_ddx: + case AMDGPUIntrinsic::r600_ddx: TextureOp = 8; break; - case AMDGPUIntrinsic::R600_ddy: + case AMDGPUIntrinsic::r600_ddy: TextureOp = 9; break; - case AMDGPUIntrinsic::R600_ldptr: - TextureOp = 10; - break; default: llvm_unreachable("Unknow Texture Operation"); } @@ -784,7 +738,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const }; return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); } - case AMDGPUIntrinsic::AMDGPU_dp4: { + case AMDGPUIntrinsic::r600_dot4: { SDValue Args[8] = { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 
DAG.getConstant(0, DL, MVT::i32)), @@ -806,6 +760,11 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); } + case Intrinsic::r600_implicitarg_ptr: { + MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS); + uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + return DAG.getConstant(ByteOffset, DL, PtrVT); + } case Intrinsic::r600_read_ngroups_x: return LowerImplicitParameter(DAG, VT, DL, 0); case Intrinsic::r600_read_ngroups_y: @@ -825,7 +784,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_local_size_z: return LowerImplicitParameter(DAG, VT, DL, 8); - case Intrinsic::AMDGPU_read_workdim: { + case Intrinsic::r600_read_workdim: + case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name. uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM); return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4); } @@ -848,14 +808,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T0_Z, VT); - case Intrinsic::AMDGPU_rsq: - // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. - return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + case Intrinsic::r600_recipsqrt_ieee: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + + case Intrinsic::r600_recipsqrt_clamped: + return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); } + // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) break; } @@ -950,6 +910,22 @@ SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return vectorToVerticalVector(DAG, Insert); } +SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, + SDValue Op, + SelectionDAG &DAG) const { + + GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); + + const DataLayout &DL = DAG.getDataLayout(); + const GlobalValue *GV = GSD->getGlobal(); + MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + + SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA); +} + SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { // On hw >= R700, COS/SIN input must be between -1. and 1. // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) @@ -977,7 +953,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, DAG.getNode(ISD::FADD, DL, VT, FractPart, DAG.getConstantFP(-0.5, DL, MVT::f32))); - if (Gen >= AMDGPUSubtarget::R700) + if (Gen >= R600Subtarget::R700) return TrigVal; // On R600 hw, COS/SIN input must be between -Pi and Pi. 
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, @@ -1088,7 +1064,7 @@ SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { } SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, - SDLoc DL, + const SDLoc &DL, unsigned DwordOffset) const { unsigned ByteOffset = DwordOffset * 4; PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), @@ -1099,8 +1075,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR - MachinePointerInfo(ConstantPointerNull::get(PtrType)), - false, false, false, 0); + MachinePointerInfo(ConstantPointerNull::get(PtrType))); } bool R600TargetLowering::isZero(SDValue Op) const { @@ -1113,6 +1088,20 @@ bool R600TargetLowering::isZero(SDValue Op) const { } } +bool R600TargetLowering::isHWTrueValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->isExactlyValue(1.0); + } + return isAllOnesConstant(Op); +} + +bool R600TargetLowering::isHWFalseValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->getValueAPF().isZero(); + } + return isNullConstant(Op); +} + SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); @@ -1311,19 +1300,73 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth, } } +SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, + SelectionDAG &DAG) const { + SDLoc DL(Store); + + unsigned Mask = 0; + if (Store->getMemoryVT() == MVT::i8) { + Mask = 0xff; + } else if (Store->getMemoryVT() == MVT::i16) { + Mask = 0xffff; + } + + SDValue Chain = Store->getChain(); + SDValue BasePtr = Store->getBasePtr(); + EVT MemVT = Store->getMemoryVT(); + + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, + DAG.getConstant(2, DL, MVT::i32)); + SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, + DAG.getConstant(0x3, DL, MVT::i32)); + + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, + Store->getValue()); + + SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); + + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, + MaskedValue, ShiftAmt); + + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(Mask, DL, MVT::i32), + ShiftAmt); + DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, + DAG.getConstant(0xffffffff, DL, MVT::i32)); + Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + + SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); + return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); +} + SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); + if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG)) + return Result; + StoreSDNode *StoreNode = cast<StoreSDNode>(Op); - SDValue Chain = Op.getOperand(0); - SDValue Value = Op.getOperand(1); - SDValue Ptr = Op.getOperand(2); + unsigned AS = StoreNode->getAddressSpace(); + SDValue Value = StoreNode->getValue(); + EVT ValueVT = Value.getValueType(); - SDValue Result = 
AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Result.getNode()) { - return Result; + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && + ValueVT.isVector()) { + return SplitVectorStore(Op, DAG); } - if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { + SDLoc DL(Op); + SDValue Chain = StoreNode->getChain(); + SDValue Ptr = StoreNode->getBasePtr(); + + if (AS == AMDGPUAS::GLOBAL_ADDRESS) { if (StoreNode->isTruncatingStore()) { EVT VT = Value.getValueType(); assert(VT.bitsLE(MVT::i32)); @@ -1352,13 +1395,13 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(0, DL, MVT::i32), Mask }; - SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); + SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src); SDValue Args[3] = { Chain, Input, DWordAddr }; return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && - Value.getValueType().bitsGE(MVT::i32)) { + ValueVT.bitsGE(MVT::i32)) { // Convert pointer from byte address to dword address. Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), @@ -1373,21 +1416,16 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } } - EVT ValueVT = Value.getValueType(); - - if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + if (AS != AMDGPUAS::PRIVATE_ADDRESS) return SDValue(); - } - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) { - return Ret; - } - // Lowering for indirect addressing + EVT MemVT = StoreNode->getMemoryVT(); + if (MemVT.bitsLT(MVT::i32)) + return lowerPrivateTruncStore(StoreNode, DAG); + // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); + const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1465,37 +1503,81 @@ ConstantAddressBlock(unsigned AddressSpace) { } } -SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const -{ - EVT VT = Op.getValueType(); +SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, + SelectionDAG &DAG) const { SDLoc DL(Op); - LoadSDNode *LoadNode = cast<LoadSDNode>(Op); - SDValue Chain = Op.getOperand(0); - SDValue Ptr = Op.getOperand(1); - SDValue LoweredLoad; + LoadSDNode *Load = cast<LoadSDNode>(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT MemVT = Load->getMemoryVT(); + + // <SI && AS=PRIVATE && EXTLOAD && size < 32bit, + // register (2-)byte extract. + + // Get Register holding the target. + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), + DAG.getConstant(2, DL, MVT::i32)); + // Load the Register. + SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + Load->getChain(), + Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); + + // Get offset within the register. + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, + Load->getBasePtr(), + DAG.getConstant(0x3, DL, MVT::i32)); + + // Bit offset of target byte (byteIdx * 8). + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + // Shift to the right. 
+ Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + + // Eliminate the upper bits by setting them to ... + EVT MemEltVT = MemVT.getScalarType(); + + // ... ones. + if (ExtType == ISD::SEXTLOAD) { + SDValue MemEltVTNode = DAG.getValueType(MemEltVT); + + SDValue Ops[] = { + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), + Load->getChain() + }; - if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG)) - return Ret; + return DAG.getMergeValues(Ops, DL); + } + + // ... or zeros. + SDValue Ops[] = { + DAG.getZeroExtendInReg(Ret, DL, MemEltVT), + Load->getChain() + }; - // Lower loads constant address space global variable loads - if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa<GlobalVariable>(GetUnderlyingObject( - LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) { + return DAG.getMergeValues(Ops, DL); +} - SDValue Ptr = DAG.getZExtOrTrunc( - LoadNode->getBasePtr(), DL, - getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS)); - Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), - LoadNode->getChain(), Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + LoadSDNode *LoadNode = cast<LoadSDNode>(Op); + unsigned AS = LoadNode->getAddressSpace(); + EVT MemVT = LoadNode->getMemoryVT(); + ISD::LoadExtType ExtType = LoadNode->getExtensionType(); + + if (AS == AMDGPUAS::PRIVATE_ADDRESS && + ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) { + return lowerPrivateExtLoad(Op, DAG); } + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Chain = LoadNode->getChain(); + SDValue Ptr = LoadNode->getBasePtr(); + if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { SDValue MergedValues[2] = { - ScalarizeVectorLoad(Op, DAG), + scalarizeVectorLoad(LoadNode, DAG), Chain }; return DAG.getMergeValues(MergedValues, DL); @@ -1526,8 +1608,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const NewVT = VT; NumElements = VT.getVectorNumElements(); } - Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, - makeArrayRef(Slots, NumElements)); + Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); } else { // non-constant ptr can't be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, @@ -1550,6 +1631,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const return DAG.getMergeValues(MergedValues, DL); } + SDValue LoweredLoad; + // For most operations returning SDValue() will result in the node being // expanded by the DAG Legalizer. 
This is not the case for ISD::LOAD, so we // need to manually expand loads that may be legal in some address spaces and @@ -1560,12 +1643,9 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { EVT MemVT = LoadNode->getMemoryVT(); assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); - SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, - LoadNode->getPointerInfo(), MemVT, - LoadNode->isVolatile(), - LoadNode->isNonTemporal(), - LoadNode->isInvariant(), - LoadNode->getAlignment()); + SDValue NewLoad = DAG.getExtLoad( + ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT, + LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags()); SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, DAG.getValueType(MemVT)); @@ -1579,8 +1659,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); + const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1590,6 +1669,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const EVT ElemVT = VT.getVectorElementType(); SDValue Loads[4]; + assert(NumElemVT <= 4); assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " "vector width in load"); @@ -1603,11 +1683,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const DAG.getTargetConstant(Channel, DL, MVT::i32), Op.getOperand(2)); } - for (unsigned i = NumElemVT; i < 4; ++i) { - Loads[i] = DAG.getUNDEF(ElemVT); - } - EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); - LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT); + LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT)); } else { LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, Chain, Ptr, @@ -1632,16 +1709,28 @@ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { Chain, Jump, Cond); } +SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); + + FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); + + unsigned FrameIndex = FIN->getIndex(); + unsigned IgnoredFrameReg; + unsigned Offset = + TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), + Op.getValueType()); +} + /// XXX Only kernel functions are supported, so we can assume for now that /// every function is a kernel function, but in the future we should use /// separate calling conventions for kernel and non-kernel functions. 
SDValue R600TargetLowering::LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); @@ -1664,7 +1753,7 @@ SDValue R600TargetLowering::LowerFormalArguments( MemVT = MemVT.getVectorElementType(); } - if (MFI->getShaderType() != ShaderType::COMPUTE) { + if (AMDGPU::isShader(CallConv)) { unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Register); @@ -1699,11 +1788,11 @@ SDValue R600TargetLowering::LowerFormalArguments( unsigned Offset = 36 + VA.getLocMemOffset(); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); - SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, - DAG.getConstant(Offset, DL, MVT::i32), - DAG.getUNDEF(MVT::i32), - PtrInfo, - MemVT, false, true, true, 4); + SDValue Arg = DAG.getLoad( + ISD::UNINDEXED, Ext, VT, DL, Chain, + DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, + MemVT, /* Alignment = */ 4, + MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant); // 4 is the preferred alignment for the CONSTANT memory space. InVals.push_back(Arg); @@ -1719,6 +1808,26 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + if (!VT.isSimple() || VT == MVT::Other) + return false; + + if (VT.bitsLT(MVT::i32)) + return false; + + // TODO: This is a rough estimate. + if (IsFast) + *IsFast = true; + + return VT.bitsGT(MVT::i32) && Align % 4 == 0; +} + static SDValue CompactSwizzlableVector( SelectionDAG &DAG, SDValue VectorEntry, DenseMap<unsigned, unsigned> &RemapSwizzle) { @@ -1732,7 +1841,7 @@ static SDValue CompactSwizzlableVector( }; for (unsigned i = 0; i < 4; i++) { - if (NewBldVec[i].getOpcode() == ISD::UNDEF) + if (NewBldVec[i].isUndef()) // We mask write here to teach later passes that the ith element of this // vector is undef. Thus we can use it to reduce 128 bits reg usage, // break false dependencies and additionnaly make assembly easier to read. 
@@ -1747,7 +1856,7 @@ static SDValue CompactSwizzlableVector( } } - if (NewBldVec[i].getOpcode() == ISD::UNDEF) + if (NewBldVec[i].isUndef()) continue; for (unsigned j = 0; j < i; j++) { if (NewBldVec[i] == NewBldVec[j]) { @@ -1758,8 +1867,8 @@ static SDValue CompactSwizzlableVector( } } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); + return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), + NewBldVec); } static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, @@ -1796,14 +1905,13 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, } } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); + return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), + NewBldVec); } - -SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, - SDValue Swz[4], SelectionDAG &DAG, - SDLoc DL) const { +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], + SelectionDAG &DAG, + const SDLoc &DL) const { assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); // Old -> New swizzle values DenseMap<unsigned, unsigned> SwizzleRemap; @@ -1886,7 +1994,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, SDLoc dl(N); // If the inserted element is an UNDEF, just use the input vector. - if (InVal.getOpcode() == ISD::UNDEF) + if (InVal.isUndef()) return InVec; EVT VT = InVec.getValueType(); @@ -1907,7 +2015,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, if (InVec.getOpcode() == ISD::BUILD_VECTOR) { Ops.append(InVec.getNode()->op_begin(), InVec.getNode()->op_end()); - } else if (InVec.getOpcode() == ISD::UNDEF) { + } else if (InVec.isUndef()) { unsigned NElts = VT.getVectorNumElements(); Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); } else { @@ -1927,7 +2035,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } // Return the new vector - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } // Extract_vec (Build_vector) generated by custom lowering @@ -1953,8 +2061,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: { // Try common optimizations - SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - if (Ret.getNode()) + if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI)) return Ret; // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> @@ -2053,13 +2160,14 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } -static bool -FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, - SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { - const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); +bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, + SDValue &Src, SDValue &Neg, SDValue &Abs, + SDValue &Sel, SDValue &Imm, + SelectionDAG &DAG) const { + const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); if (!Src.isMachineOpcode()) return false; + switch (Src.getMachineOpcode()) { case AMDGPU::FNEG_R600: if (!Neg.getNode()) @@ -2127,6 +2235,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); return true; } + case AMDGPU::MOV_IMM_GLOBAL_ADDR: + // Check if the Imm slot is used. Taken from below. 
+ if (cast<ConstantSDNode>(Imm)->getZExtValue()) + return false; + Imm = Src.getOperand(0); + Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32); + return true; case AMDGPU::MOV_IMM_I32: case AMDGPU::MOV_IMM_F32: { unsigned ImmReg = AMDGPU::ALU_LITERAL_X; @@ -2177,14 +2292,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, } } - /// \brief Fold the instructions after selecting them SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); + const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); if (!Node->isMachineOpcode()) return Node; + unsigned Opcode = Node->getMachineOpcode(); SDValue FakeOp; |
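The bulk of the new functionality in this patch is the custom lowering of sub-dword private (scratch) accesses: lowerPrivateTruncStore and lowerPrivateExtLoad emit a read-modify-write of a whole 32-bit register, splitting the byte pointer into a dword index and a byte offset and masking/shifting the value into place. As a reading aid only (not part of the patch), here is a minimal standalone C++ sketch of that arithmetic; the function and variable names below are illustrative, not identifiers from the LLVM tree.

// Illustrative sketch only (assumed names, not from the patch): emulate the
// read-modify-write that lowerPrivateTruncStore builds out of REGISTER_LOAD /
// REGISTER_STORE nodes for an i8 or i16 store to private memory.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Private memory is modeled as an array of 32-bit registers; Addr is a byte
// address, MemBits is 8 or 16.
static void truncStorePrivate(uint32_t *Regs, uint32_t Addr, uint32_t Value,
                              unsigned MemBits) {
  assert(MemBits == 8 || MemBits == 16);
  uint32_t Mask = (MemBits == 8) ? 0xff : 0xffff;
  uint32_t DWordAddr = Addr >> 2;   // Ptr      = SRL(BasePtr, 2)
  uint32_t ByteIdx   = Addr & 0x3;  // ByteIdx  = AND(BasePtr, 3)
  uint32_t ShiftAmt  = ByteIdx * 8; // ShiftAmt = SHL(ByteIdx, 3)

  uint32_t Dst = Regs[DWordAddr];    // REGISTER_LOAD of the containing dword
  Dst &= ~(Mask << ShiftAmt);        // clear the addressed byte/halfword
  Dst |= (Value & Mask) << ShiftAmt; // splice in the truncated value
  Regs[DWordAddr] = Dst;             // REGISTER_STORE of the updated dword
}

int main() {
  uint32_t Regs[2] = {0xaaaaaaaa, 0xbbbbbbbb};
  truncStorePrivate(Regs, 5, 0x1234, 8); // i8 store at byte offset 5
  std::printf("%08x %08x\n", static_cast<unsigned>(Regs[0]),
              static_cast<unsigned>(Regs[1])); // prints: aaaaaaaa bbbb34bb
  return 0;
}

lowerPrivateExtLoad in the same patch is the mirror image: it loads the containing dword, shifts the addressed byte down by the same ShiftAmt, and then sign- or zero-extends it in-register depending on the extension type.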