author     Dimitry Andric <dim@FreeBSD.org>  2022-07-03 14:10:23 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2022-07-03 14:10:23 +0000
commit     145449b1e420787bb99721a429341fa6be3adfb6
tree       1d56ae694a6de602e348dd80165cf881a36600ed /llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
parent     ecbca9f5fb7d7613d2b94982c4825eb0d33d6842
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  484
1 file changed, 476 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index fff4f6729c99..8a66213931ff 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"
@@ -63,7 +64,7 @@ enum class SIAtomicScope {
};
/// The distinct address spaces supported by the AMDGPU target for
-/// atomic memory operation. Can be ORed toether.
+/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
NONE = 0u,
GLOBAL = 1u << 0,
@@ -459,6 +460,56 @@ public:
Position Pos) const override;
};
+class SIGfx940CacheControl : public SIGfx90ACacheControl {
+protected:
+
+ /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::SC0);
+ }
+
+ /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::SC1);
+ }
+
+ /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::NT);
+ }
+
+public:
+
+ SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
+
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+
+ bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
@@ -494,6 +545,20 @@ public:
Position Pos) const override;
};
+class SIGfx11CacheControl : public SIGfx10CacheControl {
+public:
+ SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
+};
+
class SIMemoryLegalizer final : public MachineFunctionPass {
private:
@@ -649,7 +714,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
return None;
}
- SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
+ SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
@@ -668,7 +733,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
return None;
}
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
- ScopeOrNone.getValue();
+ *ScopeOrNone;
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
@@ -730,7 +795,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
bool IsCrossAddressSpaceOrdering = false;
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
- ScopeOrNone.getValue();
+ *ScopeOrNone;
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
@@ -775,13 +840,17 @@ bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
+ if (ST.hasGFX940Insts())
+ return std::make_unique<SIGfx940CacheControl>(ST);
if (ST.hasGFX90AInsts())
return std::make_unique<SIGfx90ACacheControl>(ST);
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
return std::make_unique<SIGfx7CacheControl>(ST);
- return std::make_unique<SIGfx10CacheControl>(ST);
+ if (Generation < AMDGPUSubtarget::GFX11)
+ return std::make_unique<SIGfx10CacheControl>(ST);
+ return std::make_unique<SIGfx11CacheControl>(ST);
}
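
As a quick reference for the dispatch order above, here is a minimal standalone sketch (not LLVM code; the enum and struct are illustrative stand-ins for GCNSubtarget, and the generation ranges are collapsed into single values): the feature checks for GFX940 and GFX90A take precedence over the generation comparisons, and GFX11 is the new final case.

#include <string>

// Illustrative stand-ins for the GCNSubtarget queries used by create().
enum class Gen { SouthernIslands, PreGfx10, Gfx10, Gfx11 };
struct SubtargetModel {
  bool HasGFX940Insts = false;
  bool HasGFX90AInsts = false;
  Gen Generation = Gen::Gfx10;
};

// Mirrors the if-chain in SICacheControl::create above.
std::string selectCacheControl(const SubtargetModel &ST) {
  if (ST.HasGFX940Insts) return "SIGfx940CacheControl";
  if (ST.HasGFX90AInsts) return "SIGfx90ACacheControl";
  if (ST.Generation == Gen::SouthernIslands) return "SIGfx6CacheControl";
  if (ST.Generation == Gen::PreGfx10) return "SIGfx7CacheControl";
  if (ST.Generation == Gen::Gfx10) return "SIGfx10CacheControl";
  return "SIGfx11CacheControl";
}

int main() {
  SubtargetModel GFX940{true, true, Gen::PreGfx10};
  return selectCacheControl(GFX940) == "SIGfx940CacheControl" ? 0 : 1;
}
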
bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -943,7 +1012,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The LDS keeps all memory operations in order for
- // the same wavesfront.
+ // the same wavefront.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
@@ -1360,7 +1429,9 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// to initiate writeback of any dirty cache lines of earlier writes by the
// same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
// writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
// Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
// vmcnt(0)" needed by the "BUFFER_WBL2".
Changed = true;
@@ -1386,6 +1457,308 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx940CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed. Setting SC
+ // bits to indicate work-group scope will do this automatically.
+ Changed |= enableSC0Bit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::enableStoreCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
+ assert(!MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // Set SC bits to indicate workgroup scope.
+ Changed |= enableSC0Bit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::enableRMWCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC1 bit to indicate system scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+ // to indicate system or agent scope. The SC0 bit is used to indicate if
+ // they are return or no-return. Leave SC1 bit unset to indicate agent
+ // scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ return Changed;
+}
+
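+
The three functions above encode the requested atomic scope into the SC0/SC1 cache-policy bits. A minimal standalone model of that encoding (not LLVM code; the bit values below are placeholders, not the real AMDGPU::CPol encoding) is:

#include <cassert>
#include <cstdint>

enum class Scope { System, Agent, Workgroup, Wavefront, SingleThread };

constexpr uint32_t SC0 = 1u << 0; // placeholder values for illustration only
constexpr uint32_t SC1 = 1u << 1;

// Loads and stores: system = SC0|SC1, agent = SC1, work-group = SC0,
// wavefront/single-thread = no bits set.
uint32_t loadStoreScopeBits(Scope S) {
  switch (S) {
  case Scope::System:    return SC0 | SC1;
  case Scope::Agent:     return SC1;
  case Scope::Workgroup: return SC0;
  default:               return 0;
  }
}

// RMW atomics: SC0 already carries the return/no-return meaning, so only SC1
// is available for scope, and only system scope sets it.
uint32_t rmwScopeBits(Scope S) {
  return S == Scope::System ? SC1 : 0;
}

int main() {
  assert(loadStoreScopeBits(Scope::Agent) == SC1);
  assert(rmwScopeBits(Scope::Agent) == 0); // agent-scope RMW leaves SC1 clear
  return 0;
}
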
+bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+ // Only handle load and store, not atomic read-modify-write instructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
+ assert(MI->mayLoad() ^ MI->mayStore());
+
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile so cannot sensibly
+ // handle it as do not want to pessimize all atomics. Also they do not support
+ // the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+ bool Changed = false;
+
+ if (IsVolatile) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+
+ return Changed;
+ }
+
+ if (IsNonTemporal) {
+ Changed |= enableNTBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
+ // CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+ // remove any cache lines of earlier writes by the same wave and ensures
+ // later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ // Ensures that following loads will not see stale remote data or local
+ // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
+ // due to the memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later reads will refetch the cache lines.
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to invalidate the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be invalidated.
+ if (ST.isTgSplitEnabled()) {
+ // Ensures L1 is invalidated if in threadgroup split mode. In
+ // non-threadgroup split mode it is a NOP, but there is no point generating
+ // it in that case if we know we are not in that mode.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate work-group scope.
+ .addImm(AMDGPU::CPol::SC0);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later reads will refetch the cache lines.
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Could generate "BUFFER_INV" but it would do nothing as there are no
+ // caches to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by the
+ // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::SYSTEM, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::AGENT, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)".
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Do not generate "BUFFER_WBL2" as there are no caches it would
+ // writeback, and would require an otherwise unnecessary
+ // "S_WAITCNT vmcnt(0)".
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
+ // S_WAITCNT needed.
+ Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos);
+
+ return Changed;
+}
+
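+
Taken together, insertAcquire and insertRelease above pick a scoped cache operation and then rely on insertWait for the required s_waitcnt. A rough standalone model of the instruction choice (not LLVM code; instruction spellings are informal and assume no other memory accesses intervene) is:

#include <string>

enum class Scope { System, Agent, Workgroup, Wavefront, SingleThread };

// Acquire: invalidate caches so later loads refetch. Work-group scope only
// needs the invalidate when threadgroup-split mode is enabled.
std::string gfx940AcquireInv(Scope S, bool TgSplit) {
  switch (S) {
  case Scope::System:    return "buffer_inv sc0 sc1";
  case Scope::Agent:     return "buffer_inv sc1";
  case Scope::Workgroup: return TgSplit ? "buffer_inv sc0" : "";
  default:               return ""; // no caches to invalidate
  }
}

// Release: write back dirty L2 lines; the insertWait() call that follows in
// insertRelease emits the "s_waitcnt vmcnt(0)" that makes the writeback visible.
std::string gfx940ReleaseWb(Scope S) {
  switch (S) {
  case Scope::System: return "buffer_wbl2 sc0 sc1";
  case Scope::Agent:  return "buffer_wbl2 sc1";
  default:            return ""; // no L2 writeback needed at narrower scopes
  }
}

int main() {
  return gfx940AcquireInv(Scope::Agent, false) == "buffer_inv sc1" &&
                 gfx940ReleaseWb(Scope::Workgroup).empty() ? 0 : 1;
}
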
bool SIGfx10CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -1547,7 +1920,7 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The LDS keeps all memory operations in order for
- // the same wavesfront.
+ // the same wavefront.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
@@ -1655,6 +2028,101 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx11CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ // Set the L0 and L1 cache policies to MISS_EVICT.
+ // Note: there is no L2 cache coherent bypass control at the ISA level.
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
+ // CU mode all waves of a work-group are on the same CU, and so the L0
+ // does not need to be bypassed.
+ if (!ST.isCuModeEnabled())
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+
+ // Only handle load and store, not atomic read-modify-write instructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
+ assert(MI->mayLoad() ^ MI->mayStore());
+
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile so cannot sensibly
+ // handle it as do not want to pessimize all atomics. Also they do not support
+ // the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+ bool Changed = false;
+
+ if (IsVolatile) {
+ // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
+ // and MISS_LRU for store instructions.
+ // Note: there is no L2 cache coherent bypass control at the ISA level.
+ if (Op == SIMemOp::LOAD)
+ Changed |= enableGLCBit(MI);
+
+ // Set MALL NOALLOC for load and store instructions.
+ Changed |= enableDLCBit(MI);
+
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+ return Changed;
+ }
+
+ if (IsNonTemporal) {
+ // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
+ // and L2 cache policy to STREAM.
+ // For stores setting both GLC and SLC configures L0 and L1 cache policy
+ // to MISS_EVICT and the L2 cache policy to STREAM.
+ if (Op == SIMemOp::STORE)
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+
+ // Set MALL NOALLOC for load and store instructions.
+ Changed |= enableDLCBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
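+
For comparison with the GFX940 path, the GFX11 handling above maps volatile and nontemporal accesses onto the existing GLC/SLC/DLC bits. A standalone model (not LLVM code; the bit values are placeholders rather than the real CPol encoding) is:

#include <cstdint>

constexpr uint32_t GLC = 1u << 0; // placeholder bit values for illustration
constexpr uint32_t SLC = 1u << 1;
constexpr uint32_t DLC = 1u << 2;

// Volatile: loads get GLC (L0/L1 MISS_EVICT); both loads and stores get DLC
// (MALL NOALLOC). Nontemporal: SLC (plus GLC for stores) and DLC.
uint32_t gfx11CPolBits(bool IsLoad, bool IsVolatile, bool IsNonTemporal) {
  uint32_t Bits = 0;
  if (IsVolatile) {
    if (IsLoad)
      Bits |= GLC;
    Bits |= DLC;
    return Bits; // a system-scope wait is also inserted after the access
  }
  if (IsNonTemporal) {
    if (!IsLoad)
      Bits |= GLC;
    Bits |= SLC | DLC;
  }
  return Bits;
}

int main() {
  return gfx11CPolBits(/*IsLoad=*/false, /*IsVolatile=*/false,
                       /*IsNonTemporal=*/true) == (GLC | SLC | DLC) ? 0 : 1;
}
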
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;