author    Dimitry Andric <dim@FreeBSD.org>    2017-06-01 20:58:36 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2017-06-01 20:58:36 +0000
commit    f382538d471e38a9b98f016c4caebd24c8d60b62 (patch)
tree      d30f3d58b1044b5355d50c17a6a96c6a0b35703a /lib/Target/AMDGPU
parent    ee2f195dd3e40f49698ca4dc2666ec09c770e80d (diff)
Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp                55
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp             10
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.h                5
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp      324
-rw-r--r--  lib/Target/AMDGPU/DSInstructions.td                    6
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp  106
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h      2
-rw-r--r--  lib/Target/AMDGPU/SIDefines.h                         40
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaitcnts.cpp                26
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td                      13
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.h                     6
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp            13
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h               6
13 files changed, 568 insertions(+), 44 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index f473944cd5283..0959014812d8b 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -503,40 +503,37 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
Info.PrivateSegmentSize = FrameInfo.getStackSize();
- if (!FrameInfo.hasCalls()) {
- Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
- MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-
- // If there are no calls, MachineRegisterInfo can tell us the used register
- // count easily.
-
- MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- HighestVGPRReg = Reg;
- break;
- }
- }
- MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- HighestSGPRReg = Reg;
- break;
- }
- }
+ Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
+ MRI.isPhysRegUsed(AMDGPU::VCC_HI);
- // We found the maximum register index. They start at 0, so add one to get the
- // number of registers.
- Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
- TRI.getHWRegIndex(HighestVGPRReg) + 1;
- Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
- TRI.getHWRegIndex(HighestSGPRReg) + 1;
+ // If there are no calls, MachineRegisterInfo can tell us the used register
+ // count easily.
- return Info;
+ MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestVGPRReg = Reg;
+ break;
+ }
}
- llvm_unreachable("calls not implemented");
+ MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestSGPRReg = Reg;
+ break;
+ }
+ }
+
+ // We found the maximum register index. They start at 0, so add one to get the
+ // number of registers.
+ Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestVGPRReg) + 1;
+ Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+ return Info;
}
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
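
Note: the hunk above lifts the register-usage scan out of the old !FrameInfo.hasCalls() guard, so resource usage is now computed unconditionally and the llvm_unreachable("calls not implemented") fallback goes away. A minimal standalone sketch of the underlying scan, with a toy predicate standing in for MachineRegisterInfo::isPhysRegUsed (the names below are illustrative, not the LLVM API):

#include <cstdio>

// Toy stand-in for MRI.isPhysRegUsed over a register file indexed 0..N-1.
static bool isPhysRegUsed(unsigned HWIndex) {
  return HWIndex == 3 || HWIndex == 17; // pretend v3 and v17 are used
}

// Walk from the highest register downward; the first used register's
// hardware index, plus one, is the number of registers in use.
static unsigned countUsedRegs(unsigned NumRegs) {
  for (unsigned I = NumRegs; I-- > 0;)
    if (isPhysRegUsed(I))
      return I + 1; // indices start at 0, so add one to get the count
  return 0;         // no register used at all
}

int main() {
  std::printf("NumVGPR = %u\n", countUsedRegs(256)); // prints 18
}
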
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 48827f4639974..596f02ae4a649 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -456,7 +456,7 @@ namespace {
class AMDGPUPassConfig : public TargetPassConfig {
public:
- AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
+ AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {
// Exceptions and StackMaps are not supported, so these passes will never do
// anything.
@@ -487,7 +487,7 @@ public:
class R600PassConfig final : public AMDGPUPassConfig {
public:
- R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
+ R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) {}
ScheduleDAGInstrs *createMachineScheduler(
@@ -503,7 +503,7 @@ public:
class GCNPassConfig final : public AMDGPUPassConfig {
public:
- GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
+ GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) {}
GCNTargetMachine &getGCNTargetMachine() const {
@@ -682,7 +682,7 @@ void R600PassConfig::addPreEmitPass() {
}
TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
- return new R600PassConfig(this, PM);
+ return new R600PassConfig(*this, PM);
}
//===----------------------------------------------------------------------===//
@@ -844,6 +844,6 @@ void GCNPassConfig::addPreEmitPass() {
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new GCNPassConfig(this, PM);
+ return new GCNPassConfig(*this, PM);
}
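
Note: all four changes in this file are the same mechanical migration — TargetPassConfig and its subclasses now take the target machine by reference rather than by pointer. A minimal sketch of the updated pattern for a hypothetical out-of-tree target (the MyTarget names are illustrative only):

class MyPassConfig : public TargetPassConfig {
public:
  MyPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {} // TM is now LLVMTargetMachine&, not TargetMachine*
};

TargetPassConfig *MyTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new MyPassConfig(*this, PM); // dereference: *this instead of this
}
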
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 934bf7f31bab4..a3c7c1982d0a6 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -69,7 +69,6 @@ public:
return -1;
return 0;
}
-
};
//===----------------------------------------------------------------------===//
@@ -89,6 +88,10 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
const R600Subtarget *getSubtargetImpl(const Function &) const override;
+
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
};
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index f5541e08e1b72..cc68c971b2490 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -161,7 +161,8 @@ public:
ImmTyOpSel,
ImmTyOpSelHi,
ImmTyNegLo,
- ImmTyNegHi
+ ImmTyNegHi,
+ ImmTySwizzle
};
struct TokOp {
@@ -474,6 +475,7 @@ public:
bool isSWaitCnt() const;
bool isHwreg() const;
bool isSendMsg() const;
+ bool isSwizzle() const;
bool isSMRDOffset8() const;
bool isSMRDOffset20() const;
bool isSMRDLiteralOffset() const;
@@ -659,6 +661,7 @@ public:
case ImmTyOpSelHi: OS << "OpSelHi"; break;
case ImmTyNegLo: OS << "NegLo"; break;
case ImmTyNegHi: OS << "NegHi"; break;
+ case ImmTySwizzle: OS << "Swizzle"; break;
}
}
@@ -994,6 +997,12 @@ private:
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
+ bool trySkipId(const StringRef Id);
+ bool trySkipToken(const AsmToken::TokenKind Kind);
+ bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg);
+ bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string");
+ bool parseExpr(int64_t &Imm);
+
public:
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
@@ -1003,6 +1012,19 @@ public:
OperandMatchResultTy parseInterpAttr(OperandVector &Operands);
OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
+ bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op,
+ const unsigned MinVal,
+ const unsigned MaxVal,
+ const StringRef ErrMsg);
+ OperandMatchResultTy parseSwizzleOp(OperandVector &Operands);
+ bool parseSwizzleOffset(int64_t &Imm);
+ bool parseSwizzleMacro(int64_t &Imm);
+ bool parseSwizzleQuadPerm(int64_t &Imm);
+ bool parseSwizzleBitmaskPerm(int64_t &Imm);
+ bool parseSwizzleBroadcast(int64_t &Imm);
+ bool parseSwizzleSwap(int64_t &Imm);
+ bool parseSwizzleReverse(int64_t &Imm);
+
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
@@ -2785,7 +2807,13 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
OptionalIdx[Op.getImmTy()] = i;
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+ AMDGPUOperand::ImmTy OffsetType =
+ (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si ||
+ Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle :
+ AMDGPUOperand::ImmTyOffset;
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType);
+
if (!IsGdsHardcoded) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
}
@@ -3384,6 +3412,298 @@ bool AMDGPUOperand::isSendMsg() const {
}
//===----------------------------------------------------------------------===//
+// parser helpers
+//===----------------------------------------------------------------------===//
+
+bool
+AMDGPUAsmParser::trySkipId(const StringRef Id) {
+ if (getLexer().getKind() == AsmToken::Identifier &&
+ Parser.getTok().getString() == Id) {
+ Parser.Lex();
+ return true;
+ }
+ return false;
+}
+
+bool
+AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) {
+ if (getLexer().getKind() == Kind) {
+ Parser.Lex();
+ return true;
+ }
+ return false;
+}
+
+bool
+AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind,
+ const StringRef ErrMsg) {
+ if (!trySkipToken(Kind)) {
+ Error(Parser.getTok().getLoc(), ErrMsg);
+ return false;
+ }
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseExpr(int64_t &Imm) {
+ return !getParser().parseAbsoluteExpression(Imm);
+}
+
+bool
+AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) {
+ SMLoc S = Parser.getTok().getLoc();
+ if (getLexer().getKind() == AsmToken::String) {
+ Val = Parser.getTok().getStringContents();
+ Parser.Lex();
+ return true;
+ } else {
+ Error(S, ErrMsg);
+ return false;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// swizzle
+//===----------------------------------------------------------------------===//
+
+LLVM_READNONE
+static unsigned
+encodeBitmaskPerm(const unsigned AndMask,
+ const unsigned OrMask,
+ const unsigned XorMask) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ return BITMASK_PERM_ENC |
+ (AndMask << BITMASK_AND_SHIFT) |
+ (OrMask << BITMASK_OR_SHIFT) |
+ (XorMask << BITMASK_XOR_SHIFT);
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleOperands(const unsigned OpNum, int64_t* Op,
+ const unsigned MinVal,
+ const unsigned MaxVal,
+ const StringRef ErrMsg) {
+ for (unsigned i = 0; i < OpNum; ++i) {
+ if (!skipToken(AsmToken::Comma, "expected a comma")){
+ return false;
+ }
+ SMLoc ExprLoc = Parser.getTok().getLoc();
+ if (!parseExpr(Op[i])) {
+ return false;
+ }
+ if (Op[i] < MinVal || Op[i] > MaxVal) {
+ Error(ExprLoc, ErrMsg);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ int64_t Lane[LANE_NUM];
+ if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX,
+ "expected a 2-bit lane id")) {
+ Imm = QUAD_PERM_ENC;
+ for (auto i = 0; i < LANE_NUM; ++i) {
+ Imm |= Lane[i] << (LANE_SHIFT * i);
+ }
+ return true;
+ }
+ return false;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleBroadcast(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ SMLoc S = Parser.getTok().getLoc();
+ int64_t GroupSize;
+ int64_t LaneIdx;
+
+ if (!parseSwizzleOperands(1, &GroupSize,
+ 2, 32,
+ "group size must be in the interval [2,32]")) {
+ return false;
+ }
+ if (!isPowerOf2_64(GroupSize)) {
+ Error(S, "group size must be a power of two");
+ return false;
+ }
+ if (parseSwizzleOperands(1, &LaneIdx,
+ 0, GroupSize - 1,
+ "lane id must be in the interval [0,group size - 1]")) {
+ Imm = encodeBitmaskPerm(BITMASK_MAX - GroupSize + 1, LaneIdx, 0);
+ return true;
+ }
+ return false;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleReverse(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ SMLoc S = Parser.getTok().getLoc();
+ int64_t GroupSize;
+
+ if (!parseSwizzleOperands(1, &GroupSize,
+ 2, 32, "group size must be in the interval [2,32]")) {
+ return false;
+ }
+ if (!isPowerOf2_64(GroupSize)) {
+ Error(S, "group size must be a power of two");
+ return false;
+ }
+
+ Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize - 1);
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleSwap(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ SMLoc S = Parser.getTok().getLoc();
+ int64_t GroupSize;
+
+ if (!parseSwizzleOperands(1, &GroupSize,
+ 1, 16, "group size must be in the interval [1,16]")) {
+ return false;
+ }
+ if (!isPowerOf2_64(GroupSize)) {
+ Error(S, "group size must be a power of two");
+ return false;
+ }
+
+ Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize);
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleBitmaskPerm(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ if (!skipToken(AsmToken::Comma, "expected a comma")) {
+ return false;
+ }
+
+ StringRef Ctl;
+ SMLoc StrLoc = Parser.getTok().getLoc();
+ if (!parseString(Ctl)) {
+ return false;
+ }
+ if (Ctl.size() != BITMASK_WIDTH) {
+ Error(StrLoc, "expected a 5-character mask");
+ return false;
+ }
+
+ unsigned AndMask = 0;
+ unsigned OrMask = 0;
+ unsigned XorMask = 0;
+
+ for (size_t i = 0; i < Ctl.size(); ++i) {
+ unsigned Mask = 1 << (BITMASK_WIDTH - 1 - i);
+ switch(Ctl[i]) {
+ default:
+ Error(StrLoc, "invalid mask");
+ return false;
+ case '0':
+ break;
+ case '1':
+ OrMask |= Mask;
+ break;
+ case 'p':
+ AndMask |= Mask;
+ break;
+ case 'i':
+ AndMask |= Mask;
+ XorMask |= Mask;
+ break;
+ }
+ }
+
+ Imm = encodeBitmaskPerm(AndMask, OrMask, XorMask);
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleOffset(int64_t &Imm) {
+
+ SMLoc OffsetLoc = Parser.getTok().getLoc();
+
+ if (!parseExpr(Imm)) {
+ return false;
+ }
+ if (!isUInt<16>(Imm)) {
+ Error(OffsetLoc, "expected a 16-bit offset");
+ return false;
+ }
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ if (skipToken(AsmToken::LParen, "expected a left parentheses")) {
+
+ SMLoc ModeLoc = Parser.getTok().getLoc();
+ bool Ok = false;
+
+ if (trySkipId(IdSymbolic[ID_QUAD_PERM])) {
+ Ok = parseSwizzleQuadPerm(Imm);
+ } else if (trySkipId(IdSymbolic[ID_BITMASK_PERM])) {
+ Ok = parseSwizzleBitmaskPerm(Imm);
+ } else if (trySkipId(IdSymbolic[ID_BROADCAST])) {
+ Ok = parseSwizzleBroadcast(Imm);
+ } else if (trySkipId(IdSymbolic[ID_SWAP])) {
+ Ok = parseSwizzleSwap(Imm);
+ } else if (trySkipId(IdSymbolic[ID_REVERSE])) {
+ Ok = parseSwizzleReverse(Imm);
+ } else {
+ Error(ModeLoc, "expected a swizzle mode");
+ }
+
+ return Ok && skipToken(AsmToken::RParen, "expected a closing parentheses");
+ }
+
+ return false;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ int64_t Imm = 0;
+
+ if (trySkipId("offset")) {
+
+ bool Ok = false;
+ if (skipToken(AsmToken::Colon, "expected a colon")) {
+ if (trySkipId("swizzle")) {
+ Ok = parseSwizzleMacro(Imm);
+ } else {
+ Ok = parseSwizzleOffset(Imm);
+ }
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTySwizzle));
+
+ return Ok? MatchOperand_Success : MatchOperand_ParseFail;
+ } else {
+ return MatchOperand_NoMatch;
+ }
+}
+
+bool
+AMDGPUOperand::isSwizzle() const {
+ return isImmTy(ImmTySwizzle);
+}
+
+//===----------------------------------------------------------------------===//
// sopp branch targets
//===----------------------------------------------------------------------===//
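
Note: taken together, the new parser entry points accept either a plain 16-bit offset: value or an offset:swizzle(...) macro and fold both into a single immediate operand; for example, ds_swizzle_b32 v5, v1 offset:swizzle(QUAD_PERM, 0, 1, 2, 3) encodes the identity quad permutation as 0x80E4. A small self-contained sketch that re-derives the immediates from the formulas in the code above (the constants are mirrored from the Swizzle enum added to SIDefines.h further below; the worked values are my own arithmetic, not taken from the commit):

#include <cassert>

// Constants mirrored from the llvm::AMDGPU::Swizzle enum in SIDefines.h.
enum : unsigned {
  QUAD_PERM_ENC     = 0x8000,
  BITMASK_PERM_ENC  = 0x0000,
  BITMASK_MAX       = 0x1F,
  BITMASK_AND_SHIFT = 0,
  BITMASK_OR_SHIFT  = 5,
  BITMASK_XOR_SHIFT = 10,
  LANE_SHIFT        = 2,
};

// Same formula as the parser's encodeBitmaskPerm above.
static unsigned encodeBitmaskPerm(unsigned And, unsigned Or, unsigned Xor) {
  return BITMASK_PERM_ENC | (And << BITMASK_AND_SHIFT) |
         (Or << BITMASK_OR_SHIFT) | (Xor << BITMASK_XOR_SHIFT);
}

int main() {
  // swizzle(QUAD_PERM, 0, 1, 2, 3): lane i lands in bits [2i+1:2i].
  unsigned QuadPerm = QUAD_PERM_ENC;
  for (unsigned i = 0; i < 4; ++i)
    QuadPerm |= i << (LANE_SHIFT * i);
  assert(QuadPerm == 0x80E4);

  // swizzle(SWAP, 4): XOR the lane id with the group size.
  assert(encodeBitmaskPerm(BITMASK_MAX, 0, 4) == 0x101F);

  // swizzle(REVERSE, 8): XOR with GroupSize - 1 reverses each group.
  assert(encodeBitmaskPerm(BITMASK_MAX, 0, 8 - 1) == 0x1C1F);

  // swizzle(BROADCAST, 8, 3): AND clears the low log2(8) bits, OR picks lane 3.
  assert(encodeBitmaskPerm(BITMASK_MAX - 8 + 1, 3, 0) == 0x0078);
}
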
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index 357e18108e7e8..fc516c3b39c28 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -145,10 +145,10 @@ class DS_1A2D_Off8_RET<string opName,
let hasPostISelHook = 1;
}
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32>
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset>
: DS_Pseudo<opName,
(outs rc:$vdst),
- (ins VGPR_32:$addr, offset:$offset, gds:$gds),
+ (ins VGPR_32:$addr, ofs:$offset, gds:$gds),
"$vdst, $addr$offset$gds"> {
let has_data0 = 0;
@@ -440,7 +440,7 @@ def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">;
def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">;
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>;
}
let mayStore = 0 in {
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index a817ff3cbaf09..523eea41897ea 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -1160,6 +1160,112 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
O << SImm16; // Unknown simm16 code.
}
+static void printSwizzleBitmask(const uint16_t AndMask,
+ const uint16_t OrMask,
+ const uint16_t XorMask,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ uint16_t Probe0 = ((0 & AndMask) | OrMask) ^ XorMask;
+ uint16_t Probe1 = ((BITMASK_MASK & AndMask) | OrMask) ^ XorMask;
+
+ O << "\"";
+
+ for (unsigned Mask = 1 << (BITMASK_WIDTH - 1); Mask > 0; Mask >>= 1) {
+ uint16_t p0 = Probe0 & Mask;
+ uint16_t p1 = Probe1 & Mask;
+
+ if (p0 == p1) {
+ if (p0 == 0) {
+ O << "0";
+ } else {
+ O << "1";
+ }
+ } else {
+ if (p0 == 0) {
+ O << "p";
+ } else {
+ O << "i";
+ }
+ }
+ }
+
+ O << "\"";
+}
+
+void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ uint16_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm == 0) {
+ return;
+ }
+
+ O << " offset:";
+
+ if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) {
+
+ O << "swizzle(" << IdSymbolic[ID_QUAD_PERM];
+ for (auto i = 0; i < LANE_NUM; ++i) {
+ O << ",";
+ O << formatDec(Imm & LANE_MASK);
+ Imm >>= LANE_SHIFT;
+ }
+ O << ")";
+
+ } else if ((Imm & BITMASK_PERM_ENC_MASK) == BITMASK_PERM_ENC) {
+
+ uint16_t AndMask = (Imm >> BITMASK_AND_SHIFT) & BITMASK_MASK;
+ uint16_t OrMask = (Imm >> BITMASK_OR_SHIFT) & BITMASK_MASK;
+ uint16_t XorMask = (Imm >> BITMASK_XOR_SHIFT) & BITMASK_MASK;
+
+ if (AndMask == BITMASK_MAX &&
+ OrMask == 0 &&
+ countPopulation(XorMask) == 1) {
+
+ O << "swizzle(" << IdSymbolic[ID_SWAP];
+ O << ",";
+ O << formatDec(XorMask);
+ O << ")";
+
+ } else if (AndMask == BITMASK_MAX &&
+ OrMask == 0 && XorMask > 0 &&
+ isPowerOf2_64(XorMask + 1)) {
+
+ O << "swizzle(" << IdSymbolic[ID_REVERSE];
+ O << ",";
+ O << formatDec(XorMask + 1);
+ O << ")";
+
+ } else {
+
+ uint16_t GroupSize = BITMASK_MAX - AndMask + 1;
+ if (GroupSize > 1 &&
+ isPowerOf2_64(GroupSize) &&
+ OrMask < GroupSize &&
+ XorMask == 0) {
+
+ O << "swizzle(" << IdSymbolic[ID_BROADCAST];
+ O << ",";
+ O << formatDec(GroupSize);
+ O << ",";
+ O << formatDec(OrMask);
+ O << ")";
+
+ } else {
+ O << "swizzle(" << IdSymbolic[ID_BITMASK_PERM];
+ O << ",";
+ printSwizzleBitmask(AndMask, OrMask, XorMask, O);
+ O << ")";
+ }
+ }
+ } else {
+ printU16ImmDecOperand(MI, OpNo, O);
+ }
+}
+
void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
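
Note: printSwizzleBitmask reverses the BITMASK_PERM encoding into the 5-character mask string by probing each bit position twice — once with an input bit of 0 (Probe0) and once with 1 (Probe1). If the two probes agree, the output bit is a constant ('0' or '1'); if they differ, the bit either passes through ('p') or is inverted ('i'). A compact sketch of the same classification with a worked mask (the example masks are mine, not from the commit):

#include <cstdint>
#include <cstdio>

// Each of the 5 lane-id bits is produced by ((id & AndMask) | OrMask) ^ XorMask.
static void printMask(uint16_t AndMask, uint16_t OrMask, uint16_t XorMask) {
  uint16_t Probe0 = ((0x00 & AndMask) | OrMask) ^ XorMask; // input bit = 0
  uint16_t Probe1 = ((0x1F & AndMask) | OrMask) ^ XorMask; // input bit = 1
  for (unsigned Mask = 1 << 4; Mask > 0; Mask >>= 1) {
    bool P0 = Probe0 & Mask, P1 = Probe1 & Mask;
    std::putchar(P0 == P1 ? (P0 ? '1' : '0') : (P0 ? 'i' : 'p'));
  }
  std::putchar('\n');
}

int main() {
  // AndMask=11101, OrMask=00010, XorMask=00001 round-trips to "ppp1i":
  // three pass-through bits, one forced to 1, one inverted.
  printMask(0x1D, 0x02, 0x01);
}
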
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index c0b8e5c510893..c8094c4b840a1 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -193,6 +193,8 @@ private:
raw_ostream &O);
void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printWaitFlag(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 80967edee0ab1..5cd90323ff67b 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -281,6 +281,46 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
} // namespace Hwreg
+namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.
+
+enum Id { // id of symbolic names
+ ID_QUAD_PERM = 0,
+ ID_BITMASK_PERM,
+ ID_SWAP,
+ ID_REVERSE,
+ ID_BROADCAST
+};
+
+enum EncBits {
+
+ // swizzle mode encodings
+
+ QUAD_PERM_ENC = 0x8000,
+ QUAD_PERM_ENC_MASK = 0xFF00,
+
+ BITMASK_PERM_ENC = 0x0000,
+ BITMASK_PERM_ENC_MASK = 0x8000,
+
+ // QUAD_PERM encodings
+
+ LANE_MASK = 0x3,
+ LANE_MAX = LANE_MASK,
+ LANE_SHIFT = 2,
+ LANE_NUM = 4,
+
+ // BITMASK_PERM encodings
+
+ BITMASK_MASK = 0x1F,
+ BITMASK_MAX = BITMASK_MASK,
+ BITMASK_WIDTH = 5,
+
+ BITMASK_AND_SHIFT = 0,
+ BITMASK_OR_SHIFT = 5,
+ BITMASK_XOR_SHIFT = 10
+};
+
+} // namespace Swizzle
+
namespace SDWA {
enum SdwaSel {
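
Note: read together with the shift constants, the 16-bit offset field decodes as follows — if bit 15 is set and bits 14:8 are clear (per QUAD_PERM_ENC_MASK), bits [7:0] hold four 2-bit lane selectors for QUAD_PERM; if bit 15 is clear, bits [4:0] are the AND mask, [9:5] the OR mask, and [14:10] the XOR mask of a BITMASK_PERM. This layout summary is inferred from the enum above, not stated explicitly in the commit.
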
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b5e3ce3dfe3ed..e22166d03e9ae 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -826,7 +826,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::RETURN ||
- MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
+ MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
@@ -1149,8 +1150,10 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
// instruction, update the upper-bound of the appropriate counter's
// bracket and the destination operand scores.
// TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
- if (TII->isDS(Inst) && (Inst.mayLoad() || Inst.mayStore())) {
- if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) {
+ uint64_t TSFlags = Inst.getDesc().TSFlags;
+ if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) {
+ if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) &&
+ TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
} else {
@@ -1183,7 +1186,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
- (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()))) {
+ (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}
} else if (TII->isSMRD(Inst)) {
@@ -1715,6 +1718,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
MLI = &getAnalysis<MachineLoopInfo>();
IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
AMDGPUASI = ST->getAMDGPUAS();
HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
@@ -1859,5 +1863,19 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
}
+ if (!MFI->isEntryFunction()) {
+ // Wait for any outstanding memory operations that the input registers may
+ // depend on. We can't track them and it's better to do the wait after the
+ // costly call sequence.
+
+ // TODO: Could insert earlier and schedule more liberally with operations
+ // that only use caller preserved registers.
+ MachineBasicBlock &EntryBB = MF.front();
+ BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ Modified = true;
+ }
+
return Modified;
}
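
Note: for the new call-aware path, a non-entry function now begins with a conservative full wait: the BuildMI call above inserts S_WAITCNT with immediate 0, which requires vmcnt, expcnt, and lgkmcnt to all be zero (rendered as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)), since the incoming argument registers may depend on memory operations still outstanding at the call site.
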
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index c5287c7f64ba4..445bf79a7814e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -383,6 +383,14 @@ def SendMsgMatchClass : AsmOperandClass {
let RenderMethod = "addImmOperands";
}
+def SwizzleMatchClass : AsmOperandClass {
+ let Name = "Swizzle";
+ let PredicateMethod = "isSwizzle";
+ let ParserMethod = "parseSwizzleOp";
+ let RenderMethod = "addImmOperands";
+ let IsOptional = 1;
+}
+
def ExpTgtMatchClass : AsmOperandClass {
let Name = "ExpTgt";
let PredicateMethod = "isExpTgt";
@@ -395,6 +403,11 @@ def SendMsgImm : Operand<i32> {
let ParserMatchClass = SendMsgMatchClass;
}
+def SwizzleImm : Operand<i16> {
+ let PrintMethod = "printSwizzle";
+ let ParserMatchClass = SwizzleMatchClass;
+}
+
def SWaitMatchClass : AsmOperandClass {
let Name = "SWaitCnt";
let RenderMethod = "addImmOperands";
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index b91cdddc5520f..a648c178101a1 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -66,6 +66,12 @@ public:
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
+ // Stack access is very expensive. CSRs are also the high registers, and we
+ // want to minimize the number of used registers.
+ unsigned getCSRFirstUseCost() const override {
+ return 100;
+ }
+
unsigned getFrameRegister(const MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index b6868de6a74e3..03b11ae80500e 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -65,5 +65,18 @@ const char* const IdSymbolic[] = {
};
} // namespace Hwreg
+
+namespace Swizzle {
+
+// This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h.
+const char* const IdSymbolic[] = {
+ "QUAD_PERM",
+ "BITMASK_PERM",
+ "SWAP",
+ "REVERSE",
+ "BROADCAST",
+};
+
+} // namespace Swizzle
} // namespace AMDGPU
} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index b2dc2c0e364cd..ebb2be22b4879 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -25,6 +25,12 @@ namespace Hwreg { // Symbolic names for the hwreg(...) syntax.
extern const char* const IdSymbolic[];
} // namespace Hwreg
+
+namespace Swizzle { // Symbolic names for the swizzle(...) syntax.
+
+extern const char* const IdSymbolic[];
+
+} // namespace Swizzle
} // namespace AMDGPU
} // namespace llvm