aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2019-08-20 20:50:12 +0000
committerDimitry Andric <dim@FreeBSD.org>2019-08-20 20:50:12 +0000
commite6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Target/AMDGPU/SIMemoryLegalizer.cpp
parent1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Notes
Diffstat (limited to 'lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r--lib/Target/AMDGPU/SIMemoryLegalizer.cpp322
1 files changed, 296 insertions, 26 deletions
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index b4a4e9e33133..4320e6c957a0 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1,9 +1,8 @@
//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -146,7 +145,7 @@ private:
// only contains a single address space.
if ((OrderingAddrSpace == InstrAddrSpace) &&
isPowerOf2_32(uint32_t(InstrAddrSpace)))
- IsCrossAddressSpaceOrdering = false;
+ this->IsCrossAddressSpaceOrdering = false;
}
public:
@@ -353,6 +352,40 @@ public:
};
+class SIGfx10CacheControl : public SIGfx7CacheControl {
+protected:
+ bool CuMode = false;
+
+ /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit<AMDGPU::OpName::dlc>(MI);
+ }
+
+public:
+
+ SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
+ SIGfx7CacheControl(ST), CuMode(CuMode) {};
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
+
+ bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+ bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
class SIMemoryLegalizer final : public MachineFunctionPass {
private:
@@ -418,35 +451,46 @@ void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
SIAtomicAddrSpace InstrScope) const {
- /// TODO: For now assume OpenCL memory model which treats each
- /// address space as having a separate happens-before relation, and
- /// so an instruction only has ordering with respect to the address
- /// space it accesses, and if it accesses multiple address spaces it
- /// does not require ordering of operations in different address
- /// spaces.
- if (SSID == SyncScope::System)
+ if (SSID == SyncScope::System)
+ return std::make_tuple(SIAtomicScope::SYSTEM,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getAgentSSID())
+ return std::make_tuple(SIAtomicScope::AGENT,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getWorkgroupSSID())
+ return std::make_tuple(SIAtomicScope::WORKGROUP,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getWavefrontSSID())
+ return std::make_tuple(SIAtomicScope::WAVEFRONT,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == SyncScope::SingleThread)
+ return std::make_tuple(SIAtomicScope::SINGLETHREAD,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getSystemOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SYSTEM,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == MMI->getAgentSSID())
+ if (SSID == MMI->getAgentOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::AGENT,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == MMI->getWorkgroupSSID())
+ if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WORKGROUP,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == MMI->getWavefrontSSID())
+ if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WAVEFRONT,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == SyncScope::SingleThread)
+ if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- /// TODO: To support HSA Memory Model need to add additional memory
- /// scopes that specify that do require cross address space
- /// ordering.
return None;
}
@@ -613,7 +657,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return make_unique<SIGfx6CacheControl>(ST);
- return make_unique<SIGfx7CacheControl>(ST);
+ if (Generation < AMDGPUSubtarget::GFX10)
+ return make_unique<SIGfx7CacheControl>(ST);
+ return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
}
bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -722,13 +768,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
bool VMCnt = false;
bool LGKMCnt = false;
- bool EXPCnt = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- VMCnt = true;
+ VMCnt |= true;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
@@ -752,7 +797,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// also synchronizing with global/GDS memory as LDS operations
// could be reordered with respect to later global/GDS memory
// operations of the same wave.
- LGKMCnt = IsCrossAddrSpaceOrdering;
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
@@ -774,7 +819,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// also synchronizing with global/LDS memory as GDS operations
// could be reordered with respect to later global/LDS memory
// operations of the same wave.
- EXPCnt = IsCrossAddrSpaceOrdering;
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
@@ -787,11 +832,11 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
}
}
- if (VMCnt || LGKMCnt || EXPCnt) {
+ if (VMCnt || LGKMCnt) {
unsigned WaitCntImmediate =
AMDGPU::encodeWaitcnt(IV,
VMCnt ? 0 : getVmcntBitMask(IV),
- EXPCnt ? 0 : getExpcntBitMask(IV),
+ getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
Changed = true;
@@ -851,6 +896,231 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx10CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ /// TODO Do not set glc for rmw atomic operations as they
+ /// implicitly bypass the L0/L1 caches.
+
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ Changed |= enableGLCBit(MI);
+ Changed |= enableDLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
+ // CU mode and all waves of a work-group are on the same CU, and so the
+ // L0 does not need to be bypassed.
+ if (!CuMode) Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not hava a cache.
+
+ return Changed;
+}
+
+bool SIGfx10CacheControl::enableNonTemporal(
+ const MachineBasicBlock::iterator &MI) const {
+ assert(MI->mayLoad() ^ MI->mayStore());
+ bool Changed = false;
+
+ Changed |= enableSLCBit(MI);
+ /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
+
+ return Changed;
+}
+
+bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
+ // in CU mode and all waves of a work-group are on the same CU, and so the
+ // L0 does not need to be invalidated.
+ if (!CuMode) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not hava a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
+bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ bool VMCnt = false;
+ bool VSCnt = false;
+ bool LGKMCnt = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+ VMCnt |= true;
+ if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+ VSCnt |= true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore need to wait for operations to complete to ensure
+ // they are visible to waves in the other CU as the L0 is per CU.
+ // Otherwise in CU mode and all waves of a work-group are on the same CU
+ // which shares the same L0.
+ if (!CuMode) {
+ if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+ VMCnt |= true;
+ if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+ VSCnt |= true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The L0 cache keeps all memory operations in order for
+ // work-items in the same wavefront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ // If no cross address space ordering then an LDS waitcnt is not
+ // needed as LDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. Required if
+ // also synchronizing with global/GDS memory as LDS operations
+ // could be reordered with respect to later global/GDS memory
+ // operations of the same wave.
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The LDS keeps all memory operations in order for
+ // the same wavesfront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ // If no cross address space ordering then an GDS waitcnt is not
+ // needed as GDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. Required if
+ // also synchronizing with global/LDS memory as GDS operations
+ // could be reordered with respect to later global/LDS memory
+ // operations of the same wave.
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The GDS keeps all memory operations in order for
+ // the same work-group.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if (VMCnt || LGKMCnt) {
+ unsigned WaitCntImmediate =
+ AMDGPU::encodeWaitcnt(IV,
+ VMCnt ? 0 : getVmcntBitMask(IV),
+ getExpcntBitMask(IV),
+ LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ Changed = true;
+ }
+
+ if (VSCnt) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+ Changed = true;
+ }
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;