author     Dimitry Andric <dim@FreeBSD.org>  2022-07-03 14:10:23 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2022-07-03 14:10:23 +0000
commit     145449b1e420787bb99721a429341fa6be3adfb6
tree       1d56ae694a6de602e348dd80165cf881a36600ed /llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
parent     ecbca9f5fb7d7613d2b94982c4825eb0d33d6842
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  484
1 file changed, 476 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index fff4f6729c99..8a66213931ff 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"
@@ -63,7 +64,7 @@ enum class SIAtomicScope {
};
/// The distinct address spaces supported by the AMDGPU target for
-/// atomic memory operation. Can be ORed toether.
+/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
NONE = 0u,
GLOBAL = 1u << 0,
@@ -459,6 +460,56 @@ public:
Position Pos) const override;
};
+class SIGfx940CacheControl : public SIGfx90ACacheControl {
+protected:
+
+ /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::SC0);
+ }
+
+ /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::SC1);
+ }
+
+ /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::NT);
+ }
+
+public:
+
+ SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
+
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+
+ bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
@@ -494,6 +545,20 @@ public:
Position Pos) const override;
};
+class SIGfx11CacheControl : public SIGfx10CacheControl {
+public:
+ SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
+};
+
class SIMemoryLegalizer final : public MachineFunctionPass {
private:
@@ -649,7 +714,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
return None;
}
- SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
+ SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
@@ -668,7 +733,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
return None;
}
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
- ScopeOrNone.getValue();
+ *ScopeOrNone;
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
@@ -730,7 +795,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
bool IsCrossAddressSpaceOrdering = false;
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
- ScopeOrNone.getValue();
+ *ScopeOrNone;
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
@@ -775,13 +840,17 @@ bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
+ if (ST.hasGFX940Insts())
+ return std::make_unique<SIGfx940CacheControl>(ST);
if (ST.hasGFX90AInsts())
return std::make_unique<SIGfx90ACacheControl>(ST);
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
return std::make_unique<SIGfx7CacheControl>(ST);
- return std::make_unique<SIGfx10CacheControl>(ST);
+ if (Generation < AMDGPUSubtarget::GFX11)
+ return std::make_unique<SIGfx10CacheControl>(ST);
+ return std::make_unique<SIGfx11CacheControl>(ST);
}
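
As a quick reference for the dispatch order above, here is a minimal standalone sketch (not LLVM code; the enum and struct are illustrative stand-ins for GCNSubtarget, and the generation ranges are collapsed into single values): the feature checks for GFX940 and GFX90A take precedence over the generation comparisons, and GFX11 is the new final case.

#include <string>

// Illustrative stand-ins for the GCNSubtarget queries used by create().
enum class Gen { SouthernIslands, PreGfx10, Gfx10, Gfx11 };
struct SubtargetModel {
  bool HasGFX940Insts = false;
  bool HasGFX90AInsts = false;
  Gen Generation = Gen::Gfx10;
};

// Mirrors the if-chain in SICacheControl::create above.
std::string selectCacheControl(const SubtargetModel &ST) {
  if (ST.HasGFX940Insts) return "SIGfx940CacheControl";
  if (ST.HasGFX90AInsts) return "SIGfx90ACacheControl";
  if (ST.Generation == Gen::SouthernIslands) return "SIGfx6CacheControl";
  if (ST.Generation == Gen::PreGfx10) return "SIGfx7CacheControl";
  if (ST.Generation == Gen::Gfx10) return "SIGfx10CacheControl";
  return "SIGfx11CacheControl";
}

int main() {
  SubtargetModel GFX940{true, true, Gen::PreGfx10};
  return selectCacheControl(GFX940) == "SIGfx940CacheControl" ? 0 : 1;
}
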
bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -943,7 +1012,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The LDS keeps all memory operations in order for
- // the same wavesfront.
+ // the same wavefront.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
@@ -1360,7 +1429,9 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// to initiate writeback of any dirty cache lines of earlier writes by the
// same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
// writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
// Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
// vmcnt(0)" needed by the "BUFFER_WBL2".
Changed = true;
@@ -1386,6 +1457,308 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx940CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed. Setting SC
+ // bits to indicate work-group scope will do this automatically.
+ Changed |= enableSC0Bit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::enableStoreCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
+ assert(!MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // Set SC bits to indicate workgroup scope.
+ Changed |= enableSC0Bit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::enableRMWCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC1 bit to indicate system scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+ // to indicate system or agent scope. The SC0 bit is used to indicate if
+ // they are return or no-return. Leave SC1 bit unset to indicate agent
+ // scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ return Changed;
+}
+
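+
The three functions above encode the requested atomic scope into the SC0/SC1 cache-policy bits. A minimal standalone model of that encoding (not LLVM code; the bit values below are placeholders, not the real AMDGPU::CPol encoding) is:

#include <cassert>
#include <cstdint>

enum class Scope { System, Agent, Workgroup, Wavefront, SingleThread };

constexpr uint32_t SC0 = 1u << 0; // placeholder values for illustration only
constexpr uint32_t SC1 = 1u << 1;

// Loads and stores: system = SC0|SC1, agent = SC1, work-group = SC0,
// wavefront/single-thread = no bits set.
uint32_t loadStoreScopeBits(Scope S) {
  switch (S) {
  case Scope::System:    return SC0 | SC1;
  case Scope::Agent:     return SC1;
  case Scope::Workgroup: return SC0;
  default:               return 0;
  }
}

// RMW atomics: SC0 already carries the return/no-return meaning, so only SC1
// is available for scope, and only system scope sets it.
uint32_t rmwScopeBits(Scope S) {
  return S == Scope::System ? SC1 : 0;
}

int main() {
  assert(loadStoreScopeBits(Scope::Agent) == SC1);
  assert(rmwScopeBits(Scope::Agent) == 0); // agent-scope RMW leaves SC1 clear
  return 0;
}
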
+bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+ // Only handle load and store, not atomic read-modify-write instructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
+ assert(MI->mayLoad() ^ MI->mayStore());
+
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile so cannot sensibly
+ // handle it as do not want to pessimize all atomics. Also they do not support
+ // the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+ bool Changed = false;
+
+ if (IsVolatile) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+
+ return Changed;
+ }
+
+ if (IsNonTemporal) {
+ Changed |= enableNTBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
+ // CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+ // remove any cache lines of earlier writes by the same wave and ensures
+ // later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ // Ensures that following loads will not see stale remote data or local
+ // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
+ // due to the memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later reads will refetch the cache lines.
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to invalidate the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be invalidated.
+ if (ST.isTgSplitEnabled()) {
+ // Ensures L1 is invalidated if in threadgroup split mode. In
+ // non-threadgroup split mode it is a NOP, but there is no point generating
+ // it in that case if we know we are not in that mode.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate work-group scope.
+ .addImm(AMDGPU::CPol::SC0);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later reads will refetch the cache lines.
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Could generate "BUFFER_INV" but it would do nothing as there are no
+ // caches to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by the
+ // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::SYSTEM, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::AGENT, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)".
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Do not generate "BUFFER_WBL2" as there are no caches it would
+ // writeback, and would require an otherwise unnecessary
+ // "S_WAITCNT vmcnt(0)".
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
+ // S_WAITCNT needed.
+ Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos);
+
+ return Changed;
+}
+
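+
Taken together, insertAcquire and insertRelease above pick a scoped cache operation and then rely on insertWait for the required s_waitcnt. A rough standalone model of the instruction choice (not LLVM code; instruction spellings are informal and assume no other memory accesses intervene) is:

#include <string>

enum class Scope { System, Agent, Workgroup, Wavefront, SingleThread };

// Acquire: invalidate caches so later loads refetch. Work-group scope only
// needs the invalidate when threadgroup-split mode is enabled.
std::string gfx940AcquireInv(Scope S, bool TgSplit) {
  switch (S) {
  case Scope::System:    return "buffer_inv sc0 sc1";
  case Scope::Agent:     return "buffer_inv sc1";
  case Scope::Workgroup: return TgSplit ? "buffer_inv sc0" : "";
  default:               return ""; // no caches to invalidate
  }
}

// Release: write back dirty L2 lines; the insertWait() call that follows in
// insertRelease emits the "s_waitcnt vmcnt(0)" that makes the writeback visible.
std::string gfx940ReleaseWb(Scope S) {
  switch (S) {
  case Scope::System: return "buffer_wbl2 sc0 sc1";
  case Scope::Agent:  return "buffer_wbl2 sc1";
  default:            return ""; // no L2 writeback needed at narrower scopes
  }
}

int main() {
  return gfx940AcquireInv(Scope::Agent, false) == "buffer_inv sc1" &&
                 gfx940ReleaseWb(Scope::Workgroup).empty() ? 0 : 1;
}
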
bool SIGfx10CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -1547,7 +1920,7 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The LDS keeps all memory operations in order for
- // the same wavesfront.
+ // the same wavefront.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
@@ -1655,6 +2028,101 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx11CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ // Set the L0 and L1 cache policies to MISS_EVICT.
+ // Note: there is no L2 cache coherent bypass control at the ISA level.
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
+ // CU mode all waves of a work-group are on the same CU, and so the L0
+ // does not need to be bypassed.
+ if (!ST.isCuModeEnabled())
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+
+ // Only handle load and store, not atomic read-modify-write instructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
+ assert(MI->mayLoad() ^ MI->mayStore());
+
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile so cannot sensibly
+ // handle it as do not want to pessimize all atomics. Also they do not support
+ // the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+ bool Changed = false;
+
+ if (IsVolatile) {
+ // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
+ // and MISS_LRU for store instructions.
+ // Note: there is no L2 cache coherent bypass control at the ISA level.
+ if (Op == SIMemOp::LOAD)
+ Changed |= enableGLCBit(MI);
+
+ // Set MALL NOALLOC for load and store instructions.
+ Changed |= enableDLCBit(MI);
+
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+ return Changed;
+ }
+
+ if (IsNonTemporal) {
+ // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
+ // and L2 cache policy to STREAM.
+ // For stores setting both GLC and SLC configures L0 and L1 cache policy
+ // to MISS_EVICT and the L2 cache policy to STREAM.
+ if (Op == SIMemOp::STORE)
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+
+ // Set MALL NOALLOC for load and store instructions.
+ Changed |= enableDLCBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
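+
For comparison with the GFX940 path, the GFX11 handling above maps volatile and nontemporal accesses onto the existing GLC/SLC/DLC bits. A standalone model (not LLVM code; the bit values are placeholders rather than the real CPol encoding) is:

#include <cstdint>

constexpr uint32_t GLC = 1u << 0; // placeholder bit values for illustration
constexpr uint32_t SLC = 1u << 1;
constexpr uint32_t DLC = 1u << 2;

// Volatile: loads get GLC (L0/L1 MISS_EVICT); both loads and stores get DLC
// (MALL NOALLOC). Nontemporal: SLC (plus GLC for stores) and DLC.
uint32_t gfx11CPolBits(bool IsLoad, bool IsVolatile, bool IsNonTemporal) {
  uint32_t Bits = 0;
  if (IsVolatile) {
    if (IsLoad)
      Bits |= GLC;
    Bits |= DLC;
    return Bits; // a system-scope wait is also inserted after the access
  }
  if (IsNonTemporal) {
    if (!IsLoad)
      Bits |= GLC;
    Bits |= SLC | DLC;
  }
  return Bits;
}

int main() {
  return gfx11CPolBits(/*IsLoad=*/false, /*IsVolatile=*/false,
                       /*IsNonTemporal=*/true) == (GLC | SLC | DLC) ? 0 : 1;
}
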
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;