Diffstat (limited to 'llvm/lib/Target/AMDGPU/Utils')
 llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp    |   6
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    | 204
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h      | 153
 llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 114
 llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h   |   6
5 files changed, 364 insertions, 119 deletions
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 075e08986c0c0..5819a621f55d6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -78,7 +78,11 @@ const char* const IdSymbolic[] = {
   "HW_REG_XNACK_MASK",
   nullptr, // HW_ID1, no predictable values
   nullptr, // HW_ID2, no predictable values
-  "HW_REG_POPS_PACKER"
+  "HW_REG_POPS_PACKER",
+  nullptr,
+  nullptr,
+  nullptr,
+  "HW_REG_SHADER_CYCLES"
 };
 
 } // namespace Hwreg
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5271bc3aacc65..00e6d517bde58 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -108,6 +108,7 @@ namespace AMDGPU {
 #define GET_MIMGInfoTable_IMPL
 #define GET_MIMGLZMappingTable_IMPL
 #define GET_MIMGMIPMappingTable_IMPL
+#define GET_MIMGG16MappingTable_IMPL
 #include "AMDGPUGenSearchableTables.inc"
 
 int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -148,10 +149,17 @@ struct MTBUFInfo {
   bool has_soffset;
 };
 
+struct SMInfo {
+  uint16_t Opcode;
+  bool IsBuffer;
+};
+
 #define GET_MTBUFInfoTable_DECL
 #define GET_MTBUFInfoTable_IMPL
 #define GET_MUBUFInfoTable_DECL
 #define GET_MUBUFInfoTable_IMPL
+#define GET_SMInfoTable_DECL
+#define GET_SMInfoTable_IMPL
 #include "AMDGPUGenSearchableTables.inc"
 
 int getMTBUFBaseOpcode(unsigned Opc) {
@@ -214,6 +222,11 @@ bool getMUBUFHasSoffset(unsigned Opc) {
   return Info ? Info->has_soffset : false;
 }
 
+bool getSMEMIsBuffer(unsigned Opc) {
+  const SMInfo *Info = getSMEMOpcodeHelper(Opc);
+  return Info ? Info->IsBuffer : false;
+}
+
 // Wrapper for Tablegen'd function. enum Subtarget is not defined in any
 // header files, so we need to wrap it in a function that takes unsigned
 // instead.
@@ -268,6 +281,13 @@ unsigned getLocalMemorySize(const MCSubtargetInfo *STI) {
 }
 
 unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
+  // "Per CU" really means "per whatever functional block the waves of a
+  // workgroup must share". For gfx10 in CU mode this is the CU, which contains
+  // two SIMDs.
+  if (isGFX10(*STI) && STI->getFeatureBits().test(FeatureCuMode))
+    return 2;
+  // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
+  // two CUs, so a total of four SIMDs.
   return 4;
 }
 
@@ -283,15 +303,6 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
   return std::min(N, 16u);
 }
 
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) {
-  return getMaxWavesPerEU(STI) * getEUsPerCU(STI);
-}
-
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
-                          unsigned FlatWorkGroupSize) {
-  return getWavesPerWorkGroup(STI, FlatWorkGroupSize);
-}
-
 unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
   return 1;
 }
@@ -300,13 +311,13 @@ unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
   // FIXME: Need to take scratch memory into account.
   if (!isGFX10(*STI))
     return 10;
-  return 20;
+  return hasGFX10_3Insts(*STI) ? 16 : 20;
 }
 
-unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
-                          unsigned FlatWorkGroupSize) {
-  return alignTo(getMaxWavesPerCU(STI, FlatWorkGroupSize),
-                 getEUsPerCU(STI)) / getEUsPerCU(STI);
+unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
+                                   unsigned FlatWorkGroupSize) {
+  return divideCeil(getWavesPerWorkGroup(STI, FlatWorkGroupSize),
+                    getEUsPerCU(STI));
 }
 
 unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) {
@@ -320,8 +331,7 @@ unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) {
 
 unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
                               unsigned FlatWorkGroupSize) {
-  return alignTo(FlatWorkGroupSize, getWavefrontSize(STI)) /
-         getWavefrontSize(STI);
+  return divideCeil(FlatWorkGroupSize, getWavefrontSize(STI));
 }
 
 unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) {
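Aside: the occupancy helpers above now round up with divideCeil instead of the alignTo/divide dance. A minimal standalone sketch of that rounding, with hypothetical inputs (the constants are example values, not from the patch; divideCeil mirrors llvm::divideCeil):

    // Standalone illustration of the wave-occupancy rounding above.
    #include <cstdio>

    static unsigned divideCeil(unsigned Numerator, unsigned Denominator) {
      return (Numerator + Denominator - 1) / Denominator;
    }

    int main() {
      unsigned WavefrontSize = 32;     // wave32 on gfx10
      unsigned EUsPerCU = 2;           // gfx10 in CU mode, per getEUsPerCU above
      unsigned FlatWorkGroupSize = 96; // hypothetical workgroup size

      // getWavesPerWorkGroup: ceil(96 / 32) == 3 waves per workgroup.
      unsigned WavesPerWG = divideCeil(FlatWorkGroupSize, WavefrontSize);
      // getWavesPerEUForWorkGroup: ceil(3 / 2) == 2 waves per EU.
      unsigned WavesPerEU = divideCeil(WavesPerWG, EUsPerCU);
      printf("%u waves/workgroup, %u waves/EU\n", WavesPerWG, WavesPerEU);
      return 0;
    }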
@@ -431,12 +441,21 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
   bool IsWave32 = EnableWavefrontSize32 ?
       *EnableWavefrontSize32 :
       STI->getFeatureBits().test(FeatureWavefrontSize32);
+
+  if (hasGFX10_3Insts(*STI))
+    return IsWave32 ? 16 : 8;
+
   return IsWave32 ? 8 : 4;
 }
 
 unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
                                 Optional<bool> EnableWavefrontSize32) {
-  return getVGPRAllocGranule(STI, EnableWavefrontSize32);
+  bool IsWave32 = EnableWavefrontSize32 ?
+      *EnableWavefrontSize32 :
+      STI->getFeatureBits().test(FeatureWavefrontSize32);
+
+  return IsWave32 ? 8 : 4;
 }
 
 unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
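Note the split above: on gfx10.3 the allocation granule becomes 16 (wave32) or 8 (wave64) while the encoding granule stays 8/4. A minimal sketch of what an allocation granule means in practice, with hypothetical register counts (alignTo mirrors llvm::alignTo; nothing here is from the patch itself):

    // Round a VGPR demand up to the allocation granule.
    #include <cstdio>

    static unsigned alignTo(unsigned Value, unsigned Align) {
      return (Value + Align - 1) / Align * Align;
    }

    int main() {
      unsigned AllocGranule = 16; // gfx10.3 wave32, per the change above
      unsigned NumVGPRs = 37;     // hypothetical demand from a shader
      // The hardware hands out VGPRs in granule-sized blocks: 37 -> 48.
      printf("allocated VGPRs: %u\n", alignTo(NumVGPRs, AllocGranule));
      return 0;
    }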
@@ -559,7 +578,7 @@ bool isReadOnlySegment(const GlobalValue *GV) {
 }
 
 bool shouldEmitConstantsToTextSection(const Triple &TT) {
-  return TT.getOS() == Triple::AMDPAL;
+  return TT.getOS() == Triple::AMDPAL || TT.getArch() == Triple::r600;
 }
 
 int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
@@ -722,13 +741,16 @@ static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
     return ID_SYMBOLIC_FIRST_GFX9_;
   else if (isGFX9(STI))
     return ID_SYMBOLIC_FIRST_GFX10_;
+  else if (isGFX10(STI) && !isGFX10_BEncoding(STI))
+    return ID_SYMBOLIC_FIRST_GFX1030_;
   else
     return ID_SYMBOLIC_LAST_;
 }
 
 bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
-  return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
-         IdSymbolic[Id];
+  return
+    ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
+    IdSymbolic[Id] && (Id != ID_XNACK_MASK || !AMDGPU::isGFX10_BEncoding(STI));
 }
 
 bool isValidHwreg(int64_t Id) {
@@ -927,7 +949,15 @@ bool hasSRAMECC(const MCSubtargetInfo &STI) {
 }
 
 bool hasMIMG_R128(const MCSubtargetInfo &STI) {
-  return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
+  return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128] && !STI.getFeatureBits()[AMDGPU::FeatureR128A16];
+}
+
+bool hasGFX10A16(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX10A16];
+}
+
+bool hasG16(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureG16];
 }
 
 bool hasPackedD16(const MCSubtargetInfo &STI) {
@@ -958,9 +988,17 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
 }
 
+bool isGFX10_BEncoding(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding];
+}
+
+bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
+}
+
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
   const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
-  const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+  const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
   return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
     Reg == AMDGPU::SCC;
 }
@@ -1082,6 +1120,11 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
 // (move from MC* level to Target* level). Return size in bits.
 unsigned getRegBitWidth(unsigned RCID) {
   switch (RCID) {
+  case AMDGPU::VGPR_LO16RegClassID:
+  case AMDGPU::VGPR_HI16RegClassID:
+  case AMDGPU::SGPR_LO16RegClassID:
+  case AMDGPU::AGPR_LO16RegClassID:
+    return 16;
   case AMDGPU::SGPR_32RegClassID:
   case AMDGPU::VGPR_32RegClassID:
   case AMDGPU::VRegOrLds_32RegClassID:
@@ -1103,6 +1146,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::SGPR_96RegClassID:
   case AMDGPU::SReg_96RegClassID:
   case AMDGPU::VReg_96RegClassID:
+  case AMDGPU::AReg_96RegClassID:
     return 96;
   case AMDGPU::SGPR_128RegClassID:
   case AMDGPU::SReg_128RegClassID:
@@ -1112,14 +1156,24 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::SGPR_160RegClassID:
   case AMDGPU::SReg_160RegClassID:
   case AMDGPU::VReg_160RegClassID:
+  case AMDGPU::AReg_160RegClassID:
     return 160;
+  case AMDGPU::SGPR_192RegClassID:
+  case AMDGPU::SReg_192RegClassID:
+  case AMDGPU::VReg_192RegClassID:
+  case AMDGPU::AReg_192RegClassID:
+    return 192;
+  case AMDGPU::SGPR_256RegClassID:
   case AMDGPU::SReg_256RegClassID:
   case AMDGPU::VReg_256RegClassID:
+  case AMDGPU::AReg_256RegClassID:
     return 256;
+  case AMDGPU::SGPR_512RegClassID:
   case AMDGPU::SReg_512RegClassID:
   case AMDGPU::VReg_512RegClassID:
   case AMDGPU::AReg_512RegClassID:
     return 512;
+  case AMDGPU::SGPR_1024RegClassID:
   case AMDGPU::SReg_1024RegClassID:
   case AMDGPU::VReg_1024RegClassID:
   case AMDGPU::AReg_1024RegClassID:
@@ -1141,7 +1195,7 @@ unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
 }
 
 bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
-  if (Literal >= -16 && Literal <= 64)
+  if (isInlinableIntLiteral(Literal))
     return true;
 
   uint64_t Val = static_cast<uint64_t>(Literal);
@@ -1158,7 +1212,7 @@ bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
 }
 
 bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
-  if (Literal >= -16 && Literal <= 64)
+  if (isInlinableIntLiteral(Literal))
     return true;
 
   // The actual type of the operand does not seem to matter as long
@@ -1187,7 +1241,7 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
   if (!HasInv2Pi)
     return false;
 
-  if (Literal >= -16 && Literal <= 64)
+  if (isInlinableIntLiteral(Literal))
     return true;
 
   uint16_t Val = static_cast<uint16_t>(Literal);
@@ -1217,6 +1271,17 @@ bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
   return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
 }
 
+bool isInlinableIntLiteralV216(int32_t Literal) {
+  int16_t Lo16 = static_cast<int16_t>(Literal);
+  if (isInt<16>(Literal) || isUInt<16>(Literal))
+    return isInlinableIntLiteral(Lo16);
+
+  int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
+  if (!(Literal & 0xffff))
+    return isInlinableIntLiteral(Hi16);
+  return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
+}
+
 bool isArgPassedInSGPR(const Argument *A) {
   const Function *F = A->getParent();
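A standalone restatement of the new v2i16 check with a few example values; the isInt<16>/isUInt<16> templates are expanded into a plain range check, and nothing below is part of the patch itself:

    // Mirrors isInlinableIntLiteralV216 above.
    #include <cstdint>
    #include <cstdio>

    static bool isInlinableIntLiteral(int64_t Literal) {
      return Literal >= -16 && Literal <= 64;
    }

    static bool isInlinableIntLiteralV216(int32_t Literal) {
      int16_t Lo16 = static_cast<int16_t>(Literal);
      if (Literal >= -32768 && Literal <= 65535) // isInt<16> || isUInt<16>
        return isInlinableIntLiteral(Lo16);
      int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
      if (!(Literal & 0xffff)) // low half is zero: only the high half matters
        return isInlinableIntLiteral(Hi16);
      return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); // splat value
    }

    int main() {
      printf("%d\n", isInlinableIntLiteralV216(64));         // 1: fits one half
      printf("%d\n", isInlinableIntLiteralV216(0x00400040)); // 1: splat of 64
      printf("%d\n", isInlinableIntLiteralV216(0x00410040)); // 0: halves differ
      return 0;
    }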
@@ -1247,16 +1312,61 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
   return isGCN3Encoding(ST) || isGFX10(ST);
 }
 
-int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
+static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
+  return isGFX9(ST) || isGFX10(ST);
+}
+
+bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
+                                      int64_t EncodedOffset) {
+  return hasSMEMByteOffset(ST) ?
+    isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
+}
+
+bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST,
+                                    int64_t EncodedOffset,
+                                    bool IsBuffer) {
+  return !IsBuffer &&
+         hasSMRDSignedImmOffset(ST) &&
+         isInt<21>(EncodedOffset);
+}
+
+static bool isDwordAligned(uint64_t ByteOffset) {
+  return (ByteOffset & 3) == 0;
+}
+
+uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST,
+                                uint64_t ByteOffset) {
   if (hasSMEMByteOffset(ST))
     return ByteOffset;
+
+  assert(isDwordAligned(ByteOffset));
   return ByteOffset >> 2;
 }
 
-bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
-  int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
-  return (hasSMEMByteOffset(ST)) ?
-    isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
+Optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
+                                       int64_t ByteOffset, bool IsBuffer) {
+  // The signed version is always a byte offset.
+  if (!IsBuffer && hasSMRDSignedImmOffset(ST)) {
+    assert(hasSMEMByteOffset(ST));
+    return isInt<20>(ByteOffset) ? Optional<int64_t>(ByteOffset) : None;
+  }
+
+  if (!isDwordAligned(ByteOffset) && !hasSMEMByteOffset(ST))
+    return None;
+
+  int64_t EncodedOffset = convertSMRDOffsetUnits(ST, ByteOffset);
+  return isLegalSMRDEncodedUnsignedOffset(ST, EncodedOffset)
+             ? Optional<int64_t>(EncodedOffset)
+             : None;
+}
+
+Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
+                                                int64_t ByteOffset) {
+  if (!isCI(ST) || !isDwordAligned(ByteOffset))
+    return None;
+
+  int64_t EncodedOffset = convertSMRDOffsetUnits(ST, ByteOffset);
+  return isUInt<32>(EncodedOffset) ? Optional<int64_t>(EncodedOffset) : None;
 }
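A sketch of the byte-to-dword conversion and range check these helpers perform; the subtarget query is reduced to one boolean and the offset is a hypothetical example:

    // Mirrors convertSMRDOffsetUnits plus the unsigned legality check above.
    #include <cstdint>
    #include <cstdio>

    int main() {
      bool HasSMEMByteOffset = false; // e.g. SI: the offset field is in dwords
      int64_t ByteOffset = 1024;      // hypothetical, dword-aligned

      // 1024 bytes -> 256 dwords on subtargets with dword offsets.
      int64_t Encoded = HasSMEMByteOffset ? ByteOffset : ByteOffset >> 2;

      // SI-style isUInt<8> check: 256 does not fit the 8-bit field, so the
      // real helper would return None for this offset.
      bool Fits = Encoded >= 0 && Encoded <= 0xff;
      printf("encoded=%lld fits=%d\n", (long long)Encoded, Fits);
      return 0;
    }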
 // Given Imm, split it into the values to put into the SOffset and ImmOffset
@@ -1267,8 +1377,8 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
 // aligned if they are aligned to begin with. It also ensures that additional
 // offsets within the given alignment can be added to the resulting ImmOffset.
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget, uint32_t Align) {
-  const uint32_t MaxImm = alignDown(4095, Align);
+                      const GCNSubtarget *Subtarget, Align Alignment) {
+  const uint32_t MaxImm = alignDown(4095, Alignment.value());
   uint32_t Overflow = 0;
 
   if (Imm > MaxImm) {
@@ -1286,10 +1396,10 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
       //
       // Atomic operations fail to work correctly when individual address
       // components are unaligned, even if their sum is aligned.
-      uint32_t High = (Imm + Align) & ~4095;
-      uint32_t Low = (Imm + Align) & 4095;
+      uint32_t High = (Imm + Alignment.value()) & ~4095;
+      uint32_t Low = (Imm + Alignment.value()) & 4095;
       Imm = Low;
-      Overflow = High - Align;
+      Overflow = High - Alignment.value();
     }
   }
 
@@ -1305,8 +1415,7 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
   return true;
 }
 
-SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F,
-                                               const GCNSubtarget &ST) {
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
   *this = getDefaultForCallingConv(F.getCallingConv());
 
   StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
@@ -1318,8 +1427,25 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F,
   if (!DX10ClampAttr.empty())
     DX10Clamp = DX10ClampAttr == "true";
 
-  FP32Denormals = ST.hasFP32Denormals(F);
-  FP64FP16Denormals = ST.hasFP64FP16Denormals(F);
+  StringRef DenormF32Attr = F.getFnAttribute("denormal-fp-math-f32").getValueAsString();
+  if (!DenormF32Attr.empty()) {
+    DenormalMode DenormMode = parseDenormalFPAttribute(DenormF32Attr);
+    FP32InputDenormals = DenormMode.Input == DenormalMode::IEEE;
+    FP32OutputDenormals = DenormMode.Output == DenormalMode::IEEE;
+  }
+
+  StringRef DenormAttr = F.getFnAttribute("denormal-fp-math").getValueAsString();
+  if (!DenormAttr.empty()) {
+    DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr);
+
+    if (DenormF32Attr.empty()) {
+      FP32InputDenormals = DenormMode.Input == DenormalMode::IEEE;
+      FP32OutputDenormals = DenormMode.Output == DenormalMode::IEEE;
+    }
+
+    FP64FP16InputDenormals = DenormMode.Input == DenormalMode::IEEE;
+    FP64FP16OutputDenormals = DenormMode.Output == DenormalMode::IEEE;
+  }
 }
 
 namespace {
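The constructor above now derives the mode from the denormal-fp-math / denormal-fp-math-f32 function attributes instead of the subtarget. A sketch of how an attribute string maps onto the split flags, assuming the "<output>,<input>" spelling that parseDenormalFPAttribute accepts; the mini-parser below is illustrative only, not LLVM's:

    // Illustrative mapping from a "denormal-fp-math" attribute string to the
    // split input/output flags.
    #include <cstdio>
    #include <cstring>

    struct DenormFlags { bool InputDenormals, OutputDenormals; };

    static DenormFlags parseDenormAttr(const char *Attr) {
      DenormFlags F;
      const char *Comma = strchr(Attr, ',');
      // "ieee" means denormals are preserved in that direction.
      F.OutputDenormals = strncmp(Attr, "ieee", 4) == 0;
      F.InputDenormals = Comma ? strcmp(Comma + 1, "ieee") == 0
                               : F.OutputDenormals;
      return F;
    }

    int main() {
      // "preserve-sign,preserve-sign" flushes both inputs and outputs.
      DenormFlags F = parseDenormAttr("preserve-sign,preserve-sign");
      printf("InputDenormals=%d OutputDenormals=%d\n",
             F.InputDenormals, F.OutputDenormals);
      return 0;
    }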
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a5bada2890d2c..e71554575f6af 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -12,10 +12,10 @@
 #include "AMDGPU.h"
 #include "AMDKernelCodeT.h"
 #include "SIDefines.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetParser.h"
@@ -26,17 +26,13 @@
 namespace llvm {
 
 class Argument;
-class AMDGPUSubtarget;
-class FeatureBitset;
 class Function;
 class GCNSubtarget;
 class GlobalValue;
-class MCContext;
 class MCRegisterClass;
 class MCRegisterInfo;
-class MCSection;
 class MCSubtargetInfo;
-class MachineMemOperand;
+class StringRef;
 class Triple;
 
 namespace AMDGPU {
@@ -87,15 +83,6 @@ unsigned getEUsPerCU(const MCSubtargetInfo *STI);
 unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
                                unsigned FlatWorkGroupSize);
 
-/// \returns Maximum number of waves per compute unit for given subtarget \p
-/// STI without any kind of limitation.
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI);
-
-/// \returns Maximum number of waves per compute unit for given subtarget \p
-/// STI and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
-                          unsigned FlatWorkGroupSize);
-
 /// \returns Minimum number of waves per execution unit for given subtarget \p
 /// STI.
 unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);
@@ -104,10 +91,10 @@
 /// STI without any kind of limitation.
 unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI);
 
-/// \returns Maximum number of waves per execution unit for given subtarget \p
-/// STI and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
-                          unsigned FlatWorkGroupSize);
+/// \returns Number of waves per execution unit required to support the given
+/// \p FlatWorkGroupSize.
+unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
+                                   unsigned FlatWorkGroupSize);
 
 /// \returns Minimum flat work group size for given subtarget \p STI.
 unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI);
@@ -116,7 +103,7 @@
 unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI);
 
 /// \returns Number of waves per work group for given subtarget \p STI and
-/// limited by given \p FlatWorkGroupSize.
+/// \p FlatWorkGroupSize.
 unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
                               unsigned FlatWorkGroupSize);
 
@@ -211,6 +198,7 @@ struct MIMGBaseOpcodeInfo {
   uint8_t NumExtraArgs;
 
   bool Gradients;
+  bool G16;
   bool Coordinates;
   bool LodOrClampOrMip;
   bool HasD16;
@@ -247,11 +235,19 @@ struct MIMGMIPMappingInfo {
   MIMGBaseOpcode NONMIP;
 };
 
+struct MIMGG16MappingInfo {
+  MIMGBaseOpcode G;
+  MIMGBaseOpcode G16;
+};
+
 LLVM_READONLY
 const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
 
 LLVM_READONLY
-const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L);
+const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);
+
+LLVM_READONLY
+const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G);
 
 LLVM_READONLY
 int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -308,6 +304,9 @@ LLVM_READONLY
 bool getMUBUFHasSoffset(unsigned Opc);
 
 LLVM_READONLY
+bool getSMEMIsBuffer(unsigned Opc);
+
+LLVM_READONLY
 const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
                                                   uint8_t NumComponents,
                                                   uint8_t NumFormat,
@@ -551,6 +550,8 @@ inline bool isKernel(CallingConv::ID CC) {
 bool hasXNACK(const MCSubtargetInfo &STI);
 bool hasSRAMECC(const MCSubtargetInfo &STI);
 bool hasMIMG_R128(const MCSubtargetInfo &STI);
+bool hasGFX10A16(const MCSubtargetInfo &STI);
+bool hasG16(const MCSubtargetInfo &STI);
 bool hasPackedD16(const MCSubtargetInfo &STI);
 
 bool isSI(const MCSubtargetInfo &STI);
@@ -558,6 +559,9 @@ bool isCI(const MCSubtargetInfo &STI);
 bool isVI(const MCSubtargetInfo &STI);
 bool isGFX9(const MCSubtargetInfo &STI);
 bool isGFX10(const MCSubtargetInfo &STI);
+bool isGCN3Encoding(const MCSubtargetInfo &STI);
+bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
+bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
 
 /// Is Reg - scalar register
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -633,6 +637,13 @@ inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
   return getOperandSize(Desc.OpInfo[OpNo]);
 }
 
+/// Is this literal inlinable, and not one of the values intended for floating
+/// point values.
+LLVM_READNONE
+inline bool isInlinableIntLiteral(int64_t Literal) {
+  return Literal >= -16 && Literal <= 64;
+}
+
 /// Is this literal inlinable
 LLVM_READNONE
 bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
@@ -646,11 +657,35 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
 LLVM_READNONE
 bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
 
+LLVM_READNONE
+bool isInlinableIntLiteralV216(int32_t Literal);
+
 bool isArgPassedInSGPR(const Argument *Arg);
 
-/// \returns The encoding that will be used for \p ByteOffset in the SMRD
-/// offset field.
-int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+LLVM_READONLY
+bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
+                                      int64_t EncodedOffset);
+
+LLVM_READONLY
+bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST,
+                                    int64_t EncodedOffset,
+                                    bool IsBuffer);
+
+/// Convert \p ByteOffset to dwords if the subtarget uses dword SMRD immediate
+/// offsets.
+uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset);
+
+/// \returns The encoding that will be used for \p ByteOffset in the
+/// SMRD offset field, or None if it won't fit. On GFX9 and GFX10
+/// S_LOAD instructions have a signed offset, on other subtargets it is
+/// unsigned. S_BUFFER has an unsigned offset for all subtargets.
+Optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
+                                       int64_t ByteOffset, bool IsBuffer);
+
+/// \return The encoding that can be used for a 32-bit literal offset in an SMRD
+/// instruction. This is only useful on CI.
+Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
+                                                int64_t ByteOffset);
 
 /// \returns true if this offset is small enough to fit in the SMRD
 /// offset field. \p ByteOffset should be the offset in bytes and
@@ -658,7 +693,8 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget, uint32_t Align = 4);
+                      const GCNSubtarget *Subtarget,
+                      Align Alignment = Align(4));
 
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
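The declaration above now takes an Align defaulting to Align(4). A standalone walk-through of the split for one hypothetical input, mirroring the splitMUBUFOffset implementation earlier in this diff (the subtarget-specific SOffset restrictions are omitted):

    // Mirrors the splitMUBUFOffset arithmetic from the .cpp hunk above for a
    // single hypothetical input.
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t Align = 4;                     // Align(4), the default
      const uint32_t MaxImm = 4095 / Align * Align; // alignDown(4095, 4) == 4092
      uint32_t Imm = 5000;                          // hypothetical total offset
      uint32_t Overflow = 0;

      if (Imm > MaxImm) {
        if (Imm <= MaxImm + 64) {
          // A small spill fits an SOffset inline constant.
          Overflow = Imm - MaxImm;
          Imm = MaxImm;
        } else {
          // Spill a large, reusable value into SOffset; see the comments in
          // the implementation hunk above.
          uint32_t High = (Imm + Align) & ~4095u;
          uint32_t Low = (Imm + Align) & 4095u;
          Imm = Low;
          Overflow = High - Align;
        }
      }
      // 5000 -> ImmOffset 908, SOffset 4092 (908 + 4092 == 5000).
      printf("ImmOffset=%u SOffset=%u\n", Imm, Overflow);
      return 0;
    }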
@@ -677,45 +713,76 @@ struct SIModeRegisterDefaults {
 
   /// If this is set, neither input nor output denormals are flushed for most
   /// f32 instructions.
-  ///
-  /// TODO: Split into separate input and output fields if necessary like the
-  /// control bits really provide?
-  bool FP32Denormals : 1;
+  bool FP32InputDenormals : 1;
+  bool FP32OutputDenormals : 1;
 
   /// If this is set, neither input nor output denormals are flushed for both
   /// f64 and f16/v2f16 instructions.
-  bool FP64FP16Denormals : 1;
+  bool FP64FP16InputDenormals : 1;
+  bool FP64FP16OutputDenormals : 1;
 
   SIModeRegisterDefaults() :
     IEEE(true),
     DX10Clamp(true),
-    FP32Denormals(true),
-    FP64FP16Denormals(true) {}
+    FP32InputDenormals(true),
+    FP32OutputDenormals(true),
+    FP64FP16InputDenormals(true),
+    FP64FP16OutputDenormals(true) {}
 
-  // FIXME: Should not depend on the subtarget
-  SIModeRegisterDefaults(const Function &F, const GCNSubtarget &ST);
+  SIModeRegisterDefaults(const Function &F);
 
   static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
     const bool IsCompute = AMDGPU::isCompute(CC);
 
     SIModeRegisterDefaults Mode;
-    Mode.DX10Clamp = true;
     Mode.IEEE = IsCompute;
-    Mode.FP32Denormals = false; // FIXME: Should be on by default.
-    Mode.FP64FP16Denormals = true;
     return Mode;
   }
 
   bool operator ==(const SIModeRegisterDefaults Other) const {
     return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp &&
-           FP32Denormals == Other.FP32Denormals &&
-           FP64FP16Denormals == Other.FP64FP16Denormals;
+           FP32InputDenormals == Other.FP32InputDenormals &&
+           FP32OutputDenormals == Other.FP32OutputDenormals &&
+           FP64FP16InputDenormals == Other.FP64FP16InputDenormals &&
+           FP64FP16OutputDenormals == Other.FP64FP16OutputDenormals;
+  }
+
+  bool allFP32Denormals() const {
+    return FP32InputDenormals && FP32OutputDenormals;
+  }
+
+  bool allFP64FP16Denormals() const {
+    return FP64FP16InputDenormals && FP64FP16OutputDenormals;
+  }
+
+  /// Get the encoding value for the FP_DENORM bits of the mode register for
+  /// the FP32 denormal mode.
+  uint32_t fpDenormModeSPValue() const {
+    if (FP32InputDenormals && FP32OutputDenormals)
+      return FP_DENORM_FLUSH_NONE;
+    if (FP32InputDenormals)
+      return FP_DENORM_FLUSH_OUT;
+    if (FP32OutputDenormals)
+      return FP_DENORM_FLUSH_IN;
+    return FP_DENORM_FLUSH_IN_FLUSH_OUT;
+  }
+
+  /// Get the encoding value for the FP_DENORM bits of the mode register for
+  /// the FP64/FP16 denormal mode.
+  uint32_t fpDenormModeDPValue() const {
+    if (FP64FP16InputDenormals && FP64FP16OutputDenormals)
+      return FP_DENORM_FLUSH_NONE;
+    if (FP64FP16InputDenormals)
+      return FP_DENORM_FLUSH_OUT;
+    if (FP64FP16OutputDenormals)
+      return FP_DENORM_FLUSH_IN;
+    return FP_DENORM_FLUSH_IN_FLUSH_OUT;
   }
 
   /// Returns true if a flag is compatible: it may be enabled in the callee
   /// but disabled in the caller.
   static bool oneWayCompatible(bool CallerMode, bool CalleeMode) {
-    return CallerMode == CalleeMode || (CallerMode && !CalleeMode);
+    return CallerMode == CalleeMode || (!CallerMode && CalleeMode);
   }
 
   // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
@@ -727,8 +794,10 @@ struct SIModeRegisterDefaults {
       return false;
 
     // Allow inlining denormals enabled into denormals flushed functions.
-    return oneWayCompatible(FP64FP16Denormals, CalleeMode.FP64FP16Denormals) &&
-           oneWayCompatible(FP32Denormals, CalleeMode.FP32Denormals);
+    return oneWayCompatible(FP64FP16InputDenormals, CalleeMode.FP64FP16InputDenormals) &&
+           oneWayCompatible(FP64FP16OutputDenormals, CalleeMode.FP64FP16OutputDenormals) &&
+           oneWayCompatible(FP32InputDenormals, CalleeMode.FP32InputDenormals) &&
+           oneWayCompatible(FP32OutputDenormals, CalleeMode.FP32OutputDenormals);
   }
 };
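The fpDenormModeSPValue/fpDenormModeDPValue helpers above collapse each input/output flag pair into the two-bit FP_DENORM field of the MODE register. A small truth-table program; the enumerator values mirror SIDefines.h and should be treated as illustrative:

    // Truth table for the FP_DENORM encoding computed above.
    #include <cstdio>

    enum {
      FP_DENORM_FLUSH_IN_FLUSH_OUT = 0, // flush inputs and outputs
      FP_DENORM_FLUSH_OUT = 1,          // keep inputs, flush outputs
      FP_DENORM_FLUSH_IN = 2,           // flush inputs, keep outputs
      FP_DENORM_FLUSH_NONE = 3          // keep everything
    };

    static unsigned fpDenormModeValue(bool In, bool Out) {
      if (In && Out) return FP_DENORM_FLUSH_NONE;
      if (In) return FP_DENORM_FLUSH_OUT;
      if (Out) return FP_DENORM_FLUSH_IN;
      return FP_DENORM_FLUSH_IN_FLUSH_OUT;
    }

    int main() {
      for (int In = 0; In < 2; ++In)
        for (int Out = 0; Out < 2; ++Out)
          printf("InputDenormals=%d OutputDenormals=%d -> FP_DENORM=%u\n",
                 In, Out, fpDenormModeValue(In, Out));
      return 0;
    }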
"SPI_SHADER_USER_DATA_LS_25"}, - {0x2d26, "SPI_SHADER_USER_DATA_LS_26"}, - {0x2d27, "SPI_SHADER_USER_DATA_LS_27"}, - {0x2d28, "SPI_SHADER_USER_DATA_LS_28"}, - {0x2d29, "SPI_SHADER_USER_DATA_LS_29"}, - {0x2d2a, "SPI_SHADER_USER_DATA_LS_30"}, - {0x2d2b, "SPI_SHADER_USER_DATA_LS_31"}, + {0x2d0c, "SPI_SHADER_USER_DATA_HS_0"}, + {0x2d0d, "SPI_SHADER_USER_DATA_HS_1"}, + {0x2d0e, "SPI_SHADER_USER_DATA_HS_2"}, + {0x2d0f, "SPI_SHADER_USER_DATA_HS_3"}, + {0x2d10, "SPI_SHADER_USER_DATA_HS_4"}, + {0x2d11, "SPI_SHADER_USER_DATA_HS_5"}, + {0x2d12, "SPI_SHADER_USER_DATA_HS_6"}, + {0x2d13, "SPI_SHADER_USER_DATA_HS_7"}, + {0x2d14, "SPI_SHADER_USER_DATA_HS_8"}, + {0x2d15, "SPI_SHADER_USER_DATA_HS_9"}, + {0x2d16, "SPI_SHADER_USER_DATA_HS_10"}, + {0x2d17, "SPI_SHADER_USER_DATA_HS_11"}, + {0x2d18, "SPI_SHADER_USER_DATA_HS_12"}, + {0x2d19, "SPI_SHADER_USER_DATA_HS_13"}, + {0x2d1a, "SPI_SHADER_USER_DATA_HS_14"}, + {0x2d1b, "SPI_SHADER_USER_DATA_HS_15"}, + {0x2d1c, "SPI_SHADER_USER_DATA_HS_16"}, + {0x2d1d, "SPI_SHADER_USER_DATA_HS_17"}, + {0x2d1e, "SPI_SHADER_USER_DATA_HS_18"}, + {0x2d1f, "SPI_SHADER_USER_DATA_HS_19"}, + {0x2d20, "SPI_SHADER_USER_DATA_HS_20"}, + {0x2d21, "SPI_SHADER_USER_DATA_HS_21"}, + {0x2d22, "SPI_SHADER_USER_DATA_HS_22"}, + {0x2d23, "SPI_SHADER_USER_DATA_HS_23"}, + {0x2d24, "SPI_SHADER_USER_DATA_HS_24"}, + {0x2d25, "SPI_SHADER_USER_DATA_HS_25"}, + {0x2d26, "SPI_SHADER_USER_DATA_HS_26"}, + {0x2d27, "SPI_SHADER_USER_DATA_HS_27"}, + {0x2d28, "SPI_SHADER_USER_DATA_HS_28"}, + {0x2d29, "SPI_SHADER_USER_DATA_HS_29"}, + {0x2d2a, "SPI_SHADER_USER_DATA_HS_30"}, + {0x2d2b, "SPI_SHADER_USER_DATA_HS_31"}, + + {0x2d4c, "SPI_SHADER_USER_DATA_LS_0"}, + {0x2d4d, "SPI_SHADER_USER_DATA_LS_1"}, + {0x2d4e, "SPI_SHADER_USER_DATA_LS_2"}, + {0x2d4f, "SPI_SHADER_USER_DATA_LS_3"}, + {0x2d50, "SPI_SHADER_USER_DATA_LS_4"}, + {0x2d51, "SPI_SHADER_USER_DATA_LS_5"}, + {0x2d52, "SPI_SHADER_USER_DATA_LS_6"}, + {0x2d53, "SPI_SHADER_USER_DATA_LS_7"}, + {0x2d54, "SPI_SHADER_USER_DATA_LS_8"}, + {0x2d55, "SPI_SHADER_USER_DATA_LS_9"}, + {0x2d56, "SPI_SHADER_USER_DATA_LS_10"}, + {0x2d57, "SPI_SHADER_USER_DATA_LS_11"}, + {0x2d58, "SPI_SHADER_USER_DATA_LS_12"}, + {0x2d59, "SPI_SHADER_USER_DATA_LS_13"}, + {0x2d5a, "SPI_SHADER_USER_DATA_LS_14"}, + {0x2d5b, "SPI_SHADER_USER_DATA_LS_15"}, {0xa2aa, "IA_MULTI_VGT_PARAM"}, {0xa2a5, "VGT_GS_MAX_PRIMS_PER_SUBGROUP"}, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h index 0f17c157b2062..544ab669d9ae2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -14,16 +14,12 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H -#include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/MsgPackDocument.h" -#include <map> namespace llvm { -class AMDGPUTargetStreamer; -class formatted_raw_ostream; -class MCStreamer; class Module; +class StringRef; class AMDGPUPALMetadata { unsigned BlobType = 0; |