| author | Dimitry Andric <dim@FreeBSD.org> | 2018-07-28 10:51:19 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2018-07-28 10:51:19 +0000 |
| commit | eb11fae6d08f479c0799db45860a98af528fa6e7 (patch) | |
| tree | 44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Target/AMDGPU | |
| parent | b8a2042aa938069e862750553db0e4d82d25822c (diff) | |
Diffstat (limited to 'lib/Target/AMDGPU')
183 files changed, 15348 insertions, 7244 deletions
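
The hunks that follow repeatedly touch the AMDGPU address-space numbering: `CONSTANT_ADDRESS` moves from 2 to 4, `REGION_ADDRESS` becomes 2, a new `CONSTANT_ADDRESS_32BIT` space (6) is introduced, and the alias analysis now treats both constant address spaces as pointing to constant memory. Below is a minimal standalone C++ sketch of that numbering and the constant-memory check, using only the values visible in the diff; the `sketch` namespace and the `isConstantAddressSpace` helper are illustrative names, not the in-tree API.

```cpp
// Standalone sketch of the post-change AMDGPU address-space numbering
// (see the AMDGPU.h and AMDGPUAliasAnalysis.cpp hunks below).
#include <cstdio>

namespace sketch {
enum AMDGPUAddrSpace : unsigned {
  FLAT_ADDRESS = 0,           // flat
  GLOBAL_ADDRESS = 1,         // global memory (RAT0, VTX0)
  REGION_ADDRESS = 2,         // region/GDS (was 4 before this change)
  LOCAL_ADDRESS = 3,          // local (LDS) memory
  CONSTANT_ADDRESS = 4,       // constant memory (VTX2); was 2 before this change
  PRIVATE_ADDRESS = 5,        // private (scratch) memory
  CONSTANT_ADDRESS_32BIT = 6  // new: 32-bit constant memory
};

// Mirrors the updated pointsToConstantMemory logic: both constant address
// spaces are treated as read-only constant memory.
inline bool isConstantAddressSpace(unsigned AS) {
  return AS == CONSTANT_ADDRESS || AS == CONSTANT_ADDRESS_32BIT;
}
} // namespace sketch

int main() {
  std::printf("AS 4 constant? %d\n", sketch::isConstantAddressSpace(4)); // 1
  std::printf("AS 2 constant? %d\n", sketch::isConstantAddressSpace(2)); // 0: 2 is now region
  return 0;
}
```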
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 0ddc43ad5033..796766d94622 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -11,7 +11,6 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -50,9 +49,9 @@ FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIDebuggerInsertNopsPass(); -FunctionPass *createSIInsertWaitsPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIFixWWMLivenessPass(); +FunctionPass *createSIFormMemoryClausesPass(); FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &); FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); @@ -74,6 +73,14 @@ ModulePass *createAMDGPULowerIntrinsicsPass(); void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); extern char &AMDGPULowerIntrinsicsID; +FunctionPass *createAMDGPULowerKernelArgumentsPass(); +void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &); +extern char &AMDGPULowerKernelArgumentsID; + +ModulePass *createAMDGPULowerKernelAttributesPass(); +void initializeAMDGPULowerKernelAttributesPass(PassRegistry &); +extern char &AMDGPULowerKernelAttributesID; + void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; @@ -134,6 +141,9 @@ extern char &AMDGPUSimplifyLibCallsID; void initializeAMDGPUUseNativeCallsPass(PassRegistry &); extern char &AMDGPUUseNativeCallsID; +void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &); +extern char &AMDGPUPerfHintAnalysisID; + // Passes common to R600 and SI FunctionPass *createAMDGPUPromoteAlloca(); void initializeAMDGPUPromoteAllocaPass(PassRegistry&); @@ -144,7 +154,7 @@ FunctionPass *createAMDGPUISelDag( TargetMachine *TM = nullptr, CodeGenOpt::Level OptLevel = CodeGenOpt::Default); ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true); -ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); +ModulePass *createR600OpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); ModulePass* createAMDGPUUnifyMetadataPass(); @@ -169,12 +179,12 @@ extern char &SIMemoryLegalizerID; void initializeSIDebuggerInsertNopsPass(PassRegistry&); extern char &SIDebuggerInsertNopsID; -void initializeSIInsertWaitsPass(PassRegistry&); -extern char &SIInsertWaitsID; - void initializeSIInsertWaitcntsPass(PassRegistry&); extern char &SIInsertWaitcntsID; +void initializeSIFormMemoryClausesPass(PassRegistry&); +extern char &SIFormMemoryClausesID; + void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); extern char &AMDGPUUnifyDivergentExitNodesID; @@ -222,8 +232,11 @@ struct AMDGPUAS { MAX_COMMON_ADDRESS = 5, GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2) + CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2) LOCAL_ADDRESS = 3, ///< Address space for local memory. 
+ + CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory + /// Address space for direct addressible parameter memory (CONST0) PARAM_D_ADDRESS = 6, /// Address space for indirect addressible parameter memory (VTX1) diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index c02d0a131041..16c2a366db28 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -7,58 +7,30 @@ // //===------------------------------------------------------------===// +include "llvm/TableGen/SearchableTable.td" include "llvm/Target/Target.td" +include "AMDGPUFeatures.td" //===------------------------------------------------------------===// // Subtarget Features (device properties) //===------------------------------------------------------------===// -def FeatureFP64 : SubtargetFeature<"fp64", - "FP64", - "true", - "Enable double precision operations" ->; - -def FeatureFMA : SubtargetFeature<"fmaf", - "FMA", - "true", - "Enable single precision FMA (not as fast as mul+add, but fused)" ->; - def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", "FastFMAF32", "true", "Assuming f32 fma is at least as fast as mul + add" >; -def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops", - "HalfRate64Ops", - "true", - "Most fp64 instructions are half rate instead of quarter" ->; - -def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", - "R600ALUInst", - "false", - "Older version of ALU instructions encoding" ->; - -def FeatureVertexCache : SubtargetFeature<"HasVertexCache", - "HasVertexCache", +def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128", + "MIMG_R128", "true", - "Specify use of dedicated vertex cache" + "Support 128-bit texture resources" >; -def FeatureCaymanISA : SubtargetFeature<"caymanISA", - "CaymanISA", - "true", - "Use Cayman ISA" ->; - -def FeatureCFALUBug : SubtargetFeature<"cfalubug", - "CFALUBug", +def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops", + "HalfRate64Ops", "true", - "GPU has CF_ALU bug" + "Most fp64 instructions are half rate instead of quarter" >; def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", @@ -121,6 +93,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions" >; +def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", + "HasFmaMixInsts", + "true", + "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. 
The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -140,27 +118,6 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; -class SubtargetFeatureFetchLimit <string Value> : - SubtargetFeature <"fetch"#Value, - "TexVTXClauseSize", - Value, - "Limit the maximum number of fetches in a clause to "#Value ->; - -def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; -def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; - -class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature< - "wavefrontsize"#Value, - "WavefrontSize", - !cast<string>(Value), - "The number of threads per wavefront" ->; - -def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; -def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; -def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; - class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -171,19 +128,6 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; -class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< - "localmemorysize"#Value, - "LocalMemorySize", - !cast<string>(Value), - "The size of local memory in bytes" ->; - -def FeatureGCN : SubtargetFeature<"gcn", - "IsGCN", - "true", - "GCN or newer GPU" ->; - def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", "GCN3Encoding", "true", @@ -244,6 +188,12 @@ def FeatureScalarStores : SubtargetFeature<"scalar-stores", "Has store scalar memory instructions" >; +def FeatureScalarAtomics : SubtargetFeature<"scalar-atomics", + "HasScalarAtomics", + "true", + "Has atomic scalar memory instructions" +>; + def FeatureSDWA : SubtargetFeature<"sdwa", "HasSDWA", "true", @@ -292,6 +242,27 @@ def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", "Support clamp for integer destination" >; +def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem", + "HasUnpackedD16VMem", + "true", + "Has unpacked d16 vmem instructions" +>; + +def FeatureDLInsts : SubtargetFeature<"dl-insts", + "HasDLInsts", + "true", + "Has deep learning instructions" +>; + +def FeatureD16PreservesUnusedBits : SubtargetFeature< + "d16-preserves-unused-bits", + "D16PreservesUnusedBits", + "true", + "If present, then instructions defined by HasD16LoadStore predicate preserve " + "unused bits. Otherwise instructions defined by HasD16LoadStore predicate " + "zero unused bits." 
+>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -329,12 +300,6 @@ def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", [FeatureFP64FP16Denormals] >; -def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", - "DX10Clamp", - "true", - "clamp modifier clamps NaNs to 0.0" ->; - def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", "FPExceptions", "true", @@ -377,12 +342,6 @@ def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", "Dump MachineInstrs in the CodeEmitter" >; -def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", - "EnablePromoteAlloca", - "true", - "Enable promote alloca pass" ->; - // XXX - This should probably be removed once enabled by default def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", "EnableLoadStoreOpt", @@ -408,6 +367,12 @@ def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler", "Enable SI Machine Scheduler" >; +def FeatureEnableDS128 : SubtargetFeature<"enable-ds128", + "EnableDS128", + "true", + "Use ds_{read|write}_b128" +>; + // Unless +-flat-for-global is specified, turn on FlatForGlobal for // all OS-es on VI and newer hardware to avoid assertion failures due // to missing ADDR64 variants of MUBUF instructions. @@ -440,46 +405,30 @@ def FeatureDisable : SubtargetFeature<"", "Dummy feature to disable assembler instructions" >; -class SubtargetFeatureGeneration <string Value, - list<SubtargetFeature> Implies> : - SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value, - Value#" GPU generation", Implies>; - -def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; -def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; -def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; - -def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0] ->; - -def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16, FeatureLocalMemorySize0] ->; - -def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16, FeatureLocalMemorySize32768] +def FeatureGCN : SubtargetFeature<"gcn", + "IsGCN", + "true", + "GCN or newer GPU" >; -def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64, - FeatureLocalMemorySize32768] ->; +class GCNSubtargetFeatureGeneration <string Value, + list<SubtargetFeature> Implies> : + SubtargetFeatureGeneration <Value, "GCNSubtarget", Implies>; -def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [FeatureFP64, FeatureLocalMemorySize32768, +def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", + [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureGCN, FeatureLDSBankCount32, FeatureMovrel] >; -def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [FeatureFP64, FeatureLocalMemorySize65536, +def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", + [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel] >; -def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", - [FeatureFP64, FeatureLocalMemorySize65536, +def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", + [FeatureFP64, 
FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, @@ -489,7 +438,7 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", ] >; -def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", +def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", [FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, @@ -498,7 +447,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, - FeatureAddNoCarryInsts + FeatureAddNoCarryInsts, FeatureScalarAtomics ] >; @@ -534,7 +483,8 @@ def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1, def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2, [FeatureSeaIslands, - FeatureLDSBankCount16]>; + FeatureLDSBankCount16, + FeatureFastFMAF32]>; def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3, [FeatureSeaIslands, @@ -544,26 +494,24 @@ def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4, [FeatureSeaIslands, FeatureLDSBankCount32]>; -def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0, - [FeatureVolcanicIslands, - FeatureLDSBankCount32, - FeatureSGPRInitBug]>; - def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, [FeatureVolcanicIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, - FeatureXNACK]>; + FeatureXNACK, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2, [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureSGPRInitBug]>; + FeatureSGPRInitBug, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, [FeatureVolcanicIslands, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, [FeatureVolcanicIslands, @@ -573,14 +521,28 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0, [FeatureGFX9, FeatureMadMixInsts, - FeatureLDSBankCount32 - ]>; + FeatureLDSBankCount32, + FeatureD16PreservesUnusedBits]>; def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2, [FeatureGFX9, FeatureMadMixInsts, - FeatureLDSBankCount32 - ]>; + FeatureLDSBankCount32, + FeatureXNACK, + FeatureD16PreservesUnusedBits]>; + +def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4, + [FeatureGFX9, + FeatureLDSBankCount32, + FeatureFmaMixInsts, + FeatureD16PreservesUnusedBits]>; + +def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6, + [FeatureGFX9, + HalfRate64Ops, + FeatureFmaMixInsts, + FeatureLDSBankCount32, + FeatureDLInsts]>; //===----------------------------------------------------------------------===// // Debugger related subtarget features. 
@@ -593,13 +555,6 @@ def FeatureDebuggerInsertNops : SubtargetFeature< "Insert one nop instruction for each high level source statement" >; -def FeatureDebuggerReserveRegs : SubtargetFeature< - "amdgpu-debugger-reserve-regs", - "DebuggerReserveRegs", - "true", - "Reserve registers for debugger usage" ->; - def FeatureDebuggerEmitPrologue : SubtargetFeature< "amdgpu-debugger-emit-prologue", "DebuggerEmitPrologue", @@ -675,6 +630,7 @@ def AMDGPU : Target { SDWA9AsmParserVariant, DPPAsmParserVariant]; let AssemblyWriters = [AMDGPUAsmWriter]; + let AllowRegisterRenaming = 1; } // Dummy Instruction itineraries for pseudo instructions @@ -685,8 +641,6 @@ def NullALU : InstrItinClass; // Predicate helper class //===----------------------------------------------------------------------===// -def TruePredicate : Predicate<"true">; - def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" @@ -715,6 +669,13 @@ def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<"FeatureGFX9Insts">; +def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, + AssemblerPredicate<"FeatureUnpackedD16VMem">; +def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, + AssemblerPredicate<"!FeatureUnpackedD16VMem">; + +def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">, + AssemblerPredicate<"FeatureD16PreservesUnusedBits">; def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; @@ -733,6 +694,9 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"FeatureVOP3P">; +def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">, + AssemblerPredicate<"!FeatureVOP3P">; + def HasSDWA : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; @@ -748,38 +712,35 @@ def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, AssemblerPredicate<"FeatureMadMixInsts">; -def EnableLateCFGStructurize : Predicate< - "EnableLateStructurizeCFG">; +def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">, + AssemblerPredicate<"FeatureScalarAtomics">; -// Exists to help track down where SubtargetPredicate isn't set rather -// than letting tablegen crash with an unhelpful error. 
-def InvalidPred : Predicate<"predicate not set on instruction or pattern">; - -class PredicateControl { - Predicate SubtargetPredicate = InvalidPred; - Predicate SIAssemblerPredicate = isSICI; - Predicate VIAssemblerPredicate = isVI; - list<Predicate> AssemblerPredicates = []; - Predicate AssemblerPredicate = TruePredicate; - list<Predicate> OtherPredicates = []; - list<Predicate> Predicates = !listconcat([SubtargetPredicate, - AssemblerPredicate], - AssemblerPredicates, - OtherPredicates); -} +def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; +def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; +def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, + AssemblerPredicate<"FeatureVGPRIndexMode">; +def HasMovrel : Predicate<"Subtarget->hasMovrel()">, + AssemblerPredicate<"FeatureMovrel">; + +def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, + AssemblerPredicate<"FeatureFmaMixInsts">; -class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>, - PredicateControl; +def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, + AssemblerPredicate<"FeatureDLInsts">; +def EnableLateCFGStructurize : Predicate< + "EnableLateStructurizeCFG">; + // Include AMDGPU TD files -include "R600Schedule.td" -include "R600Processors.td" include "SISchedule.td" include "GCNProcessors.td" include "AMDGPUInstrInfo.td" include "AMDGPUIntrinsics.td" +include "SIIntrinsics.td" include "AMDGPURegisterInfo.td" include "AMDGPURegisterBanks.td" include "AMDGPUInstructions.td" +include "SIInstrInfo.td" include "AMDGPUCallingConv.td" +include "AMDGPUSearchableTables.td" diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index 392b011e387c..ef4b69d09d9f 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -61,7 +61,7 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar /* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias} }; static const AliasResult ASAliasRulesGenIsZero[6][6] = { - /* Flat Global Constant Group Region Private */ + /* Flat Global Region Group Constant Private */ /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias}, /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias}, @@ -72,9 +72,9 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar assert(AS.MAX_COMMON_ADDRESS <= 5); if (AS.FLAT_ADDRESS == 0) { assert(AS.GLOBAL_ADDRESS == 1 && - AS.REGION_ADDRESS == 4 && + AS.REGION_ADDRESS == 2 && AS.LOCAL_ADDRESS == 3 && - AS.CONSTANT_ADDRESS == 2 && + AS.CONSTANT_ADDRESS == 4 && AS.PRIVATE_ADDRESS == 5); ASAliasRules = &ASAliasRulesGenIsZero; } else { @@ -115,7 +115,8 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal) { const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); - if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) { + if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS || + Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) { return true; } diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index c27425443abc..d4bbb2c1eb8d 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -14,6 +14,9 @@ 
//===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -30,13 +33,18 @@ static cl::opt<bool> StressCalls( class AMDGPUAlwaysInline : public ModulePass { bool GlobalOpt; + void recursivelyVisitUsers(GlobalValue &GV, + SmallPtrSetImpl<Function *> &FuncsToAlwaysInline); public: static char ID; AMDGPUAlwaysInline(bool GlobalOpt = false) : ModulePass(ID), GlobalOpt(GlobalOpt) { } bool runOnModule(Module &M) override; - StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } }; } // End anonymous namespace @@ -46,15 +54,53 @@ INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline", char AMDGPUAlwaysInline::ID = 0; +void AMDGPUAlwaysInline::recursivelyVisitUsers( + GlobalValue &GV, + SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) { + SmallVector<User *, 16> Stack; + + SmallPtrSet<const Value *, 8> Visited; + + for (User *U : GV.users()) + Stack.push_back(U); + + while (!Stack.empty()) { + User *U = Stack.pop_back_val(); + if (!Visited.insert(U).second) + continue; + + if (Instruction *I = dyn_cast<Instruction>(U)) { + Function *F = I->getParent()->getParent(); + if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + FuncsToAlwaysInline.insert(F); + Stack.push_back(F); + } + + // No need to look at further users, but we do need to inline any callers. + continue; + } + + for (User *UU : U->users()) + Stack.push_back(UU); + } +} + bool AMDGPUAlwaysInline::runOnModule(Module &M) { + AMDGPUAS AMDGPUAS = AMDGPU::getAMDGPUAS(M); + std::vector<GlobalAlias*> AliasesToRemove; - std::vector<Function *> FuncsToClone; + + SmallPtrSet<Function *, 8> FuncsToAlwaysInline; + SmallPtrSet<Function *, 8> FuncsToNoInline; for (GlobalAlias &A : M.aliases()) { if (Function* F = dyn_cast<Function>(A.getAliasee())) { A.replaceAllUsesWith(F); AliasesToRemove.push_back(&A); } + + // FIXME: If the aliasee isn't a function, it's some kind of constant expr + // cast that won't be inlined through. } if (GlobalOpt) { @@ -63,31 +109,51 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { } } - auto NewAttr = StressCalls ? Attribute::NoInline : Attribute::AlwaysInline; - auto IncompatAttr - = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline; - - for (Function &F : M) { - if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && - !F.hasFnAttribute(IncompatAttr)) - FuncsToClone.push_back(&F); - } - - for (Function *F : FuncsToClone) { - ValueToValueMapTy VMap; - Function *NewFunc = CloneFunction(F, VMap); - NewFunc->setLinkage(GlobalValue::InternalLinkage); - F->replaceAllUsesWith(NewFunc); + // Always force inlining of any function that uses an LDS global address. This + // is something of a workaround because we don't have a way of supporting LDS + // objects defined in functions. LDS is always allocated by a kernel, and it + // is difficult to manage LDS usage if a function may be used by multiple + // kernels. + // + // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this + // should only appear when IPO passes manages to move LDs defined in a kernel + // into a single user function. 
+ + for (GlobalVariable &GV : M.globals()) { + // TODO: Region address + unsigned AS = GV.getType()->getAddressSpace(); + if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS.REGION_ADDRESS) + continue; + + recursivelyVisitUsers(GV, FuncsToAlwaysInline); } - for (Function &F : M) { - if (F.hasLocalLinkage() && !F.hasFnAttribute(IncompatAttr)) { - F.addFnAttr(NewAttr); + if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) { + auto IncompatAttr + = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline; + + for (Function &F : M) { + if (!F.isDeclaration() && !F.use_empty() && + !F.hasFnAttribute(IncompatAttr)) { + if (StressCalls) { + if (!FuncsToAlwaysInline.count(&F)) + FuncsToNoInline.insert(&F); + } else + FuncsToAlwaysInline.insert(&F); + } } } - return false; + + for (Function *F : FuncsToAlwaysInline) + F->addFnAttr(Attribute::AlwaysInline); + + for (Function *F : FuncsToNoInline) + F->addFnAttr(Attribute::NoInline); + + return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty(); } ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) { return new AMDGPUAlwaysInline(GlobalOpt); } + diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index ce17202f3414..1a70833a4472 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -219,7 +219,7 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee, } bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); bool HasFlat = ST.hasFlatAddressSpace(); bool HasApertureRegs = ST.hasApertureRegs(); SmallPtrSet<const Constant *, 8> ConstantExprVisited; diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index dcca3a2fab96..7465cf22b5a4 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -55,9 +55,6 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { << " DispatchID: " << FI.second.DispatchID << " FlatScratchInit: " << FI.second.FlatScratchInit << " PrivateSegmentSize: " << FI.second.PrivateSegmentSize - << " GridWorkgroupCountX: " << FI.second.GridWorkGroupCountX - << " GridWorkgroupCountY: " << FI.second.GridWorkGroupCountY - << " GridWorkgroupCountZ: " << FI.second.GridWorkGroupCountZ << " WorkGroupIDX: " << FI.second.WorkGroupIDX << " WorkGroupIDY: " << FI.second.WorkGroupIDY << " WorkGroupIDZ: " << FI.second.WorkGroupIDZ diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index bf9635549a8c..f0e6d1b83f15 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -18,7 +18,7 @@ namespace llvm { class Function; class raw_ostream; -class SISubtarget; +class GCNSubtarget; class TargetMachine; class TargetRegisterClass; class TargetRegisterInfo; @@ -111,9 +111,6 @@ struct AMDGPUFunctionArgInfo { ArgDescriptor DispatchID; ArgDescriptor FlatScratchInit; ArgDescriptor PrivateSegmentSize; - ArgDescriptor GridWorkGroupCountX; - ArgDescriptor GridWorkGroupCountY; - ArgDescriptor GridWorkGroupCountZ; // System SGPRs in kernels. 
ArgDescriptor WorkGroupIDX; diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index fda6252f46e3..e62e5d52ad74 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// +//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===// // // The LLVM Compiler Infrastructure // @@ -21,7 +21,9 @@ #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "InstPrinter/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" +#include "R600AsmPrinter.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" @@ -32,7 +34,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" @@ -40,6 +41,7 @@ #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -65,7 +67,7 @@ using namespace llvm::AMDGPU; // instructions to run at the double precision rate for the device so it's // probably best to just report no single precision denormals. static uint32_t getFPMode(const MachineFunction &F) { - const SISubtarget& ST = F.getSubtarget<SISubtarget>(); + const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>(); // TODO: Is there any real use for the flush in only / flush out only modes? uint32_t FP32Denormals = @@ -88,7 +90,7 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm, extern "C" void LLVMInitializeAMDGPUAsmPrinter() { TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(), - createAMDGPUAsmPrinterPass); + llvm::createR600AsmPrinterPass); TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), createAMDGPUAsmPrinterPass); } @@ -114,7 +116,8 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { - if (TM.getTargetTriple().getArch() != Triple::amdgcn) + if (IsaInfo::hasCodeObjectV3(getSTI()) && + TM.getTargetTriple().getOS() == Triple::AMDHSA) return; if (TM.getTargetTriple().getOS() != Triple::AMDHSA && @@ -127,10 +130,6 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { if (TM.getTargetTriple().getOS() == Triple::AMDPAL) readPALMetadata(M); - // Deprecated notes are not emitted for code object v3. - if (IsaInfo::hasCodeObjectV3(getSTI()->getFeatureBits())) - return; - // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. if (TM.getTargetTriple().getOS() == Triple::AMDHSA) getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); @@ -142,7 +141,9 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { } void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - if (TM.getTargetTriple().getArch() != Triple::amdgcn) + // TODO: Add metadata to code object v3. + if (IsaInfo::hasCodeObjectV3(getSTI()) && + TM.getTargetTriple().getOS() == Triple::AMDHSA) return; // Following code requires TargetStreamer to be present. 
@@ -189,37 +190,82 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( } void AMDGPUAsmPrinter::EmitFunctionBodyStart() { - const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>(); - if (!MFI->isEntryFunction()) + const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); + if (!MFI.isEntryFunction()) + return; + if (IsaInfo::hasCodeObjectV3(getSTI()) && + TM.getTargetTriple().getOS() == Triple::AMDHSA) return; - const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); - amd_kernel_code_t KernelCode; - if (STM.isAmdCodeObjectV2(*MF)) { + const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); + const Function &F = MF->getFunction(); + if (STM.isAmdCodeObjectV2(F) && + (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || + F.getCallingConv() == CallingConv::SPIR_KERNEL)) { + amd_kernel_code_t KernelCode; getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); - - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); getTargetStreamer()->EmitAMDKernelCodeT(KernelCode); } if (TM.getTargetTriple().getOS() != Triple::AMDHSA) return; - HSAMetadataStream.emitKernel(MF->getFunction(), - getHSACodeProps(*MF, CurrentProgramInfo), - getHSADebugProps(*MF, CurrentProgramInfo)); + HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo); +} + +void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { + const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); + if (!MFI.isEntryFunction()) + return; + if (!IsaInfo::hasCodeObjectV3(getSTI()) || + TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; + + auto &Streamer = getTargetStreamer()->getStreamer(); + auto &Context = Streamer.getContext(); + auto &ObjectFileInfo = *Context.getObjectFileInfo(); + auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); + + Streamer.PushSection(); + Streamer.SwitchSection(&ReadOnlySection); + + // CP microcode requires the kernel descriptor to be allocated on 64 byte + // alignment. + Streamer.EmitValueToAlignment(64, 0, 1, 0); + if (ReadOnlySection.getAlignment() < 64) + ReadOnlySection.setAlignment(64); + + SmallString<128> KernelName; + getNameWithPrefix(KernelName, &MF->getFunction()); + getTargetStreamer()->EmitAmdhsaKernelDescriptor( + *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), + CurrentProgramInfo.NumVGPRsForWavesPerEU, + CurrentProgramInfo.NumSGPRsForWavesPerEU - + IsaInfo::getNumExtraSGPRs(getSTI()->getFeatureBits(), + CurrentProgramInfo.VCCUsed, + CurrentProgramInfo.FlatUsed), + CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, + hasXNACK(*getSTI())); + + Streamer.PopSection(); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { + if (IsaInfo::hasCodeObjectV3(getSTI()) && + TM.getTargetTriple().getOS() == Triple::AMDHSA) { + AsmPrinter::EmitFunctionEntryLabel(); + return; + } + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); - if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) { + const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); + if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(MF->getFunction())) { SmallString<128> SymbolName; getNameWithPrefix(SymbolName, &MF->getFunction()), getTargetStreamer()->EmitAMDGPUSymbolType( SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); } - const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>(); if (STI.dumpCode()) { // Disassemble function name label to text. 
DisasmLines.push_back(MF->getName().str() + ":"); @@ -231,7 +277,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { } void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { - const AMDGPUSubtarget &STI = MBB.getParent()->getSubtarget<AMDGPUSubtarget>(); + const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>(); if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) { // Write a line for the basic block label if it is not only fallthrough. DisasmLines.push_back( @@ -283,11 +329,66 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments( uint32_t NumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, - uint64_t CodeSize) { + uint64_t CodeSize, + const AMDGPUMachineFunction *MFI) { OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); + OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), + false); +} + +uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( + const MachineFunction &MF) const { + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + uint16_t KernelCodeProperties = 0; + + if (MFI.hasPrivateSegmentBuffer()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; + } + if (MFI.hasDispatchPtr()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + } + if (MFI.hasQueuePtr()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + } + if (MFI.hasKernargSegmentPtr()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + } + if (MFI.hasDispatchID()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + } + if (MFI.hasFlatScratchInit()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + } + + return KernelCodeProperties; +} + +amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( + const MachineFunction &MF, + const SIProgramInfo &PI) const { + amdhsa::kernel_descriptor_t KernelDescriptor; + memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); + + assert(isUInt<32>(PI.ScratchSize)); + assert(isUInt<32>(PI.ComputePGMRSrc1)); + assert(isUInt<32>(PI.ComputePGMRSrc2)); + + KernelDescriptor.group_segment_fixed_size = PI.LDSSize; + KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; + KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1; + KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2; + KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); + + return KernelDescriptor; } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { @@ -301,32 +402,29 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SetupMachineFunction(MF); - const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); MCContext &Context = getObjFileLowering().getContext(); - if (!STM.isAmdHsaOS()) { + // FIXME: This should be an explicit check for Mesa. 
+ if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); OutStreamer->SwitchSection(ConfigSection); } - if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - if (MFI->isEntryFunction()) { - getSIProgramInfo(CurrentProgramInfo, MF); - } else { - auto I = CallGraphResourceInfo.insert( - std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); - SIFunctionResourceInfo &Info = I.first->second; - assert(I.second && "should only be called once per function"); - Info = analyzeResourceUsage(MF); - } - - if (STM.isAmdPalOS()) - EmitPALMetadata(MF, CurrentProgramInfo); - if (!STM.isAmdHsaOS()) { - EmitProgramInfoSI(MF, CurrentProgramInfo); - } + if (MFI->isEntryFunction()) { + getSIProgramInfo(CurrentProgramInfo, MF); } else { - EmitProgramInfoR600(MF); + auto I = CallGraphResourceInfo.insert( + std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); + SIFunctionResourceInfo &Info = I.first->second; + assert(I.second && "should only be called once per function"); + Info = analyzeResourceUsage(MF); + } + + if (STM.isAmdPalOS()) + EmitPALMetadata(MF, CurrentProgramInfo); + else if (!STM.isAmdHsaOS()) { + EmitProgramInfoSI(MF, CurrentProgramInfo); } DisasmLines.clear(); @@ -340,84 +438,74 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); OutStreamer->SwitchSection(CommentSection); - if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - if (!MFI->isEntryFunction()) { - OutStreamer->emitRawComment(" Function info:", false); - SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; - emitCommonFunctionComments( - Info.NumVGPR, - Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()), - Info.PrivateSegmentSize, - getFunctionCodeSize(MF)); - return false; - } - - OutStreamer->emitRawComment(" Kernel info:", false); - emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, - CurrentProgramInfo.NumSGPR, - CurrentProgramInfo.ScratchSize, - getFunctionCodeSize(MF)); - - OutStreamer->emitRawComment( - " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); - OutStreamer->emitRawComment( - " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); - OutStreamer->emitRawComment( - " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + - " bytes/workgroup (compile time only)", false); - - OutStreamer->emitRawComment( - " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false); - OutStreamer->emitRawComment( - " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false); - - OutStreamer->emitRawComment( - " NumSGPRsForWavesPerEU: " + - Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); - OutStreamer->emitRawComment( - " NumVGPRsForWavesPerEU: " + - Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); - - OutStreamer->emitRawComment( - " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst), - false); - OutStreamer->emitRawComment( - " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount), - false); - - if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) { - OutStreamer->emitRawComment( - " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + - Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); - OutStreamer->emitRawComment( - " DebuggerPrivateSegmentBufferSGPR: s" + - Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false); - } + if (!MFI->isEntryFunction()) { + OutStreamer->emitRawComment(" Function info:", false); 
+ SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; + emitCommonFunctionComments( + Info.NumVGPR, + Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()), + Info.PrivateSegmentSize, + getFunctionCodeSize(MF), MFI); + return false; + } + OutStreamer->emitRawComment(" Kernel info:", false); + emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, + CurrentProgramInfo.NumSGPR, + CurrentProgramInfo.ScratchSize, + getFunctionCodeSize(MF), MFI); + + OutStreamer->emitRawComment( + " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); + OutStreamer->emitRawComment( + " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); + OutStreamer->emitRawComment( + " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + + " bytes/workgroup (compile time only)", false); + + OutStreamer->emitRawComment( + " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false); + OutStreamer->emitRawComment( + " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false); + + OutStreamer->emitRawComment( + " NumSGPRsForWavesPerEU: " + + Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); + OutStreamer->emitRawComment( + " NumVGPRsForWavesPerEU: " + + Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); + + OutStreamer->emitRawComment( + " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); + + if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) { OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:USER_SGPR: " + - Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + - Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false); + " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + + Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_X_EN: " + - Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + - Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + - Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + - Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), - false); - } else { - R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - OutStreamer->emitRawComment( - Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize))); + " DebuggerPrivateSegmentBufferSGPR: s" + + Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false); } + + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:USER_SGPR: " + + Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + + Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_X_EN: " + + Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + + Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + + Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + + Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), + 
false); } if (STM.dumpCode()) { @@ -440,67 +528,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } -void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { - unsigned MaxGPR = 0; - bool killPixel = false; - const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>(); - const R600RegisterInfo *RI = STM.getRegisterInfo(); - const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (MI.getOpcode() == AMDGPU::KILLGT) - killPixel = true; - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - if (!MO.isReg()) - continue; - unsigned HWReg = RI->getHWRegIndex(MO.getReg()); - - // Register with value > 127 aren't GPR - if (HWReg > 127) - continue; - MaxGPR = std::max(MaxGPR, HWReg); - } - } - } - - unsigned RsrcReg; - if (STM.getGeneration() >= R600Subtarget::EVERGREEN) { - // Evergreen / Northern Islands - switch (MF.getFunction().getCallingConv()) { - default: LLVM_FALLTHROUGH; - case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; - case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; - case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; - case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; - } - } else { - // R600 / R700 - switch (MF.getFunction().getCallingConv()) { - default: LLVM_FALLTHROUGH; - case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH; - case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH; - case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; - case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; - } - } - - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | - S_STACK_SIZE(MFI->CFStackSize), 4); - OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); - OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - - if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { - OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); - OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4); - } -} - uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = STM.getInstrInfo(); uint64_t CodeSize = 0; @@ -510,7 +539,7 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const // TODO: CodeSize should account for multiple functions. // TODO: Should we count size of debug info? 
- if (MI.isDebugValue()) + if (MI.isDebugInstr()) continue; CodeSize += TII->getInstSizeInBytes(MI); @@ -531,30 +560,10 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, return false; } -static unsigned getNumExtraSGPRs(const SISubtarget &ST, - bool VCCUsed, - bool FlatScrUsed) { - unsigned ExtraSGPRs = 0; - if (VCCUsed) - ExtraSGPRs = 2; - - if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { - if (FlatScrUsed) - ExtraSGPRs = 4; - } else { - if (ST.isXNACKEnabled()) - ExtraSGPRs = 4; - - if (FlatScrUsed) - ExtraSGPRs = 6; - } - - return ExtraSGPRs; -} - int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs( - const SISubtarget &ST) const { - return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch); + const GCNSubtarget &ST) const { + return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), + UsesVCC, UsesFlatScratch); } AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( @@ -562,7 +571,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( SIFunctionResourceInfo Info; const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -586,6 +595,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); Info.PrivateSegmentSize = FrameInfo.getStackSize(); + if (MFI->isStackRealigned()) + Info.PrivateSegmentSize += FrameInfo.getMaxAlignment(); Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || @@ -649,7 +660,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( continue; case AMDGPU::NoRegister: - assert(MI.isDebugValue()); + assert(MI.isDebugInstr()); continue; case AMDGPU::VCC: @@ -663,6 +674,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::FLAT_SCR_HI: continue; + case AMDGPU::XNACK_MASK: + case AMDGPU::XNACK_MASK_LO: + case AMDGPU::XNACK_MASK_HI: + llvm_unreachable("xnack_mask registers should not be used"); + case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: @@ -742,8 +758,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( // conservative guesses. // 48 SGPRs - vcc, - flat_scr, -xnack - int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true, - ST.hasFlatAddressSpace()); + int MaxSGPRGuess = + 47 - IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), true, + ST.hasFlatAddressSpace()); MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); MaxVGPR = std::max(MaxVGPR, 23); @@ -798,15 +815,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, MF.getFunction().getContext().diagnose(DiagStackSize); } - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const SIInstrInfo *TII = STM.getInstrInfo(); const SIRegisterInfo *RI = &TII->getRegisterInfo(); - unsigned ExtraSGPRs = getNumExtraSGPRs(STM, - ProgInfo.VCCUsed, - ProgInfo.FlatUsed); - unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF); + // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are + // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be + // unified. 
+ unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( + STM.getFeatureBits(), ProgInfo.VCCUsed, ProgInfo.FlatUsed); // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && @@ -827,7 +845,19 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Account for extra SGPRs and VGPRs reserved for debugger use. ProgInfo.NumSGPR += ExtraSGPRs; - ProgInfo.NumVGPR += ExtraVGPRs; + + // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave + // dispatch registers are function args. + unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; + for (auto &Arg : MF.getFunction().args()) { + unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32; + if (Arg.hasAttribute(Attribute::InReg)) + WaveDispatchNumSGPR += NumRegs; + else + WaveDispatchNumVGPR += NumRegs; + } + ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR); + ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR); // Adjust number of registers used to meet default/requested minimum/maximum // number of waves per execution unit request. @@ -875,19 +905,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, Ctx.diagnose(Diag); } - // SGPRBlocks is actual number of SGPR blocks minus 1. - ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU, - STM.getSGPREncodingGranule()); - ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1; - - // VGPRBlocks is actual number of VGPR blocks minus 1. - ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU, - STM.getVGPREncodingGranule()); - ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1; - - // Record first reserved VGPR and number of reserved VGPRs. - ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0; - ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF); + ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks( + STM.getFeatureBits(), ProgInfo.NumSGPRsForWavesPerEU); + ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks( + STM.getFeatureBits(), ProgInfo.NumVGPRsForWavesPerEU); // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" @@ -909,7 +930,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DX10Clamp = STM.enableDX10Clamp(); unsigned LDSAlignShift; - if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) { + if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { // LDS is allocated in 64 dword blocks. LDSAlignShift = 8; } else { @@ -954,7 +975,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | - S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) | + // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. + S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 
0 : STM.isTrapHandlerEnabled()) | S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | @@ -981,7 +1003,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); @@ -1002,26 +1024,21 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(RsrcReg, 4); OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); - unsigned Rsrc2Val = 0; if (STM.isVGPRSpillingEnabled(MF.getFunction())) { OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); - if (TM.getTargetTriple().getOS() == Triple::AMDPAL) - Rsrc2Val = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0); - } - if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { - OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); - OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); - OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); - Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks); - } - if (Rsrc2Val) { - OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4); - OutStreamer->EmitIntValue(Rsrc2Val, 4); } } + if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { + OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); + OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4); + OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); + OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); + OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); + OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); + } + OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4); OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4); OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4); @@ -1114,8 +1131,12 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &CurrentProgramInfo, const MachineFunction &MF) const { + const Function &F = MF.getFunction(); + assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || + F.getCallingConv() == CallingConv::SPIR_KERNEL); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits()); @@ -1151,21 +1172,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (MFI->hasFlatScratchInit()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; - if (MFI->hasGridWorkgroupCountX()) { - Out.code_properties |= - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; - } - - if (MFI->hasGridWorkgroupCountY()) { - Out.code_properties |= - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; - } - - if (MFI->hasGridWorkgroupCountZ()) { - Out.code_properties |= - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; - } - if 
(MFI->hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; @@ -1175,20 +1181,17 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (STM.isXNACKEnabled()) Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; - // FIXME: Should use getKernArgSize - Out.kernarg_segment_byte_size = - STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset()); + unsigned MaxKernArgAlign; + Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; - Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst; - Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount; // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. Out.kernarg_segment_alignment = std::max((size_t)4, - countTrailingZeros(MFI->getMaxKernArgAlign())); + countTrailingZeros(MaxKernArgAlign)); if (STM.debuggerEmitPrologue()) { Out.debug_wavefront_private_segment_offset_sgpr = @@ -1198,55 +1201,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, } } -AMDGPU::HSAMD::Kernel::CodeProps::Metadata AMDGPUAsmPrinter::getHSACodeProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const { - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - HSAMD::Kernel::CodeProps::Metadata HSACodeProps; - - HSACodeProps.mKernargSegmentSize = - STM.getKernArgSegmentSize(MF, MFI.getABIArgOffset()); - HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; - HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; - HSACodeProps.mKernargSegmentAlign = - std::max(uint32_t(4), MFI.getMaxKernArgAlign()); - HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); - HSACodeProps.mNumSGPRs = CurrentProgramInfo.NumSGPR; - HSACodeProps.mNumVGPRs = CurrentProgramInfo.NumVGPR; - HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize(); - HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack; - HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled(); - HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs(); - HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs(); - - return HSACodeProps; -} - -AMDGPU::HSAMD::Kernel::DebugProps::Metadata AMDGPUAsmPrinter::getHSADebugProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const { - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - HSAMD::Kernel::DebugProps::Metadata HSADebugProps; - - if (!STM.debuggerSupported()) - return HSADebugProps; - - HSADebugProps.mDebuggerABIVersion.push_back(1); - HSADebugProps.mDebuggerABIVersion.push_back(0); - HSADebugProps.mReservedNumVGPRs = ProgramInfo.ReservedVGPRCount; - HSADebugProps.mReservedFirstVGPR = ProgramInfo.ReservedVGPRFirst; - - if (STM.debuggerEmitPrologue()) { - HSADebugProps.mPrivateSegmentBufferSGPR = - ProgramInfo.DebuggerPrivateSegmentBufferSGPR; - HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR = - ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; - } - - return HSADebugProps; -} - bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 
51d48a0c7320..22982d912c70 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU Assembly printer class. +/// AMDGPU Assembly printer class. // //===----------------------------------------------------------------------===// @@ -17,9 +17,11 @@ #include "AMDGPU.h" #include "AMDKernelCodeT.h" -#include "MCTargetDesc/AMDGPUHSAMetadataStreamer.h" +#include "AMDGPUHSAMetadataStreamer.h" +#include "SIProgramInfo.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" #include <cstddef> #include <cstdint> #include <limits> @@ -29,9 +31,10 @@ namespace llvm { +class AMDGPUMachineFunction; class AMDGPUTargetStreamer; class MCOperand; -class SISubtarget; +class GCNSubtarget; class AMDGPUAsmPrinter final : public AsmPrinter { private: @@ -47,68 +50,7 @@ private: bool HasDynamicallySizedStack = false; bool HasRecursion = false; - int32_t getTotalNumSGPRs(const SISubtarget &ST) const; - }; - - // Track resource usage for kernels / entry functions. - struct SIProgramInfo { - // Fields set in PGM_RSRC1 pm4 packet. - uint32_t VGPRBlocks = 0; - uint32_t SGPRBlocks = 0; - uint32_t Priority = 0; - uint32_t FloatMode = 0; - uint32_t Priv = 0; - uint32_t DX10Clamp = 0; - uint32_t DebugMode = 0; - uint32_t IEEEMode = 0; - uint64_t ScratchSize = 0; - - uint64_t ComputePGMRSrc1 = 0; - - // Fields set in PGM_RSRC2 pm4 packet. - uint32_t LDSBlocks = 0; - uint32_t ScratchBlocks = 0; - - uint64_t ComputePGMRSrc2 = 0; - - uint32_t NumVGPR = 0; - uint32_t NumSGPR = 0; - uint32_t LDSSize = 0; - bool FlatUsed = false; - - // Number of SGPRs that meets number of waves per execution unit request. - uint32_t NumSGPRsForWavesPerEU = 0; - - // Number of VGPRs that meets number of waves per execution unit request. - uint32_t NumVGPRsForWavesPerEU = 0; - - // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first - // fixed VGPR number reserved. - uint16_t ReservedVGPRFirst = 0; - - // The number of consecutive VGPRs reserved. - uint16_t ReservedVGPRCount = 0; - - // Fixed SGPR number used to hold wave scratch offset for entire kernel - // execution, or std::numeric_limits<uint16_t>::max() if the register is not - // used or not known. - uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR = - std::numeric_limits<uint16_t>::max(); - - // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire - // kernel execution, or std::numeric_limits<uint16_t>::max() if the register - // is not used or not known. - uint16_t DebuggerPrivateSegmentBufferSGPR = - std::numeric_limits<uint16_t>::max(); - - // Whether there is recursion, dynamic allocas, indirect calls or some other - // reason there may be statically unknown stack usage. - bool DynamicCallStack = false; - - // Bonus information for debugging. 
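An aside on the kernarg_segment_alignment change in the getAmdKernelCode hunk above: the field stores log2 of the kernarg segment alignment, clamped to a minimum of 2^4 = 16 bytes, so countTrailingZeros of a power-of-two alignment is exactly the exponent being encoded. A minimal standalone sketch of that encoding with hypothetical alignment values (the local helper stands in for llvm::countTrailingZeros):

  #include <cassert>
  #include <cstdint>

  // log2 of a nonzero power-of-two alignment: the count of trailing zero bits.
  static unsigned log2PowerOfTwo(uint64_t Align) {
    unsigned N = 0;
    while ((Align & 1) == 0) { Align >>= 1; ++N; }
    return N;
  }

  // kernarg_segment_alignment is stored as n where alignment = 2^n,
  // with a floor of n = 4 (16 bytes).
  static unsigned encodeKernargAlign(uint64_t MaxKernArgAlign) {
    unsigned Exp = log2PowerOfTwo(MaxKernArgAlign);
    return Exp < 4 ? 4 : Exp;
  }

  int main() {
    assert(encodeKernargAlign(8) == 4);   // 8-byte max alignment still encodes 16
    assert(encodeKernargAlign(16) == 4);
    assert(encodeKernargAlign(64) == 6);  // 2^6 = 64-byte alignment
    return 0;
  }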
- bool VCCUsed = false; - - SIProgramInfo() = default; + int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const; }; SIProgramInfo CurrentProgramInfo; @@ -128,16 +70,8 @@ private: unsigned &NumSGPR, unsigned &NumVGPR) const; - AMDGPU::HSAMD::Kernel::CodeProps::Metadata getHSACodeProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const; - AMDGPU::HSAMD::Kernel::DebugProps::Metadata getHSADebugProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const; - - /// \brief Emit register usage information so that the GPU driver + /// Emit register usage information so that the GPU driver /// can correctly setup the GPU state. - void EmitProgramInfoR600(const MachineFunction &MF); void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); void EmitPALMetadata(const MachineFunction &MF, @@ -145,7 +79,15 @@ private: void emitCommonFunctionComments(uint32_t NumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, - uint64_t CodeSize); + uint64_t CodeSize, + const AMDGPUMachineFunction* MFI); + + uint16_t getAmdhsaKernelCodeProperties( + const MachineFunction &MF) const; + + amdhsa::kernel_descriptor_t getAmdhsaKernelDescriptor( + const MachineFunction &MF, + const SIProgramInfo &PI) const; public: explicit AMDGPUAsmPrinter(TargetMachine &TM, @@ -160,16 +102,16 @@ public: bool doFinalization(Module &M) override; bool runOnMachineFunction(MachineFunction &MF) override; - /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated + /// Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated /// pseudo lowering. bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; - /// \brief Lower the specified LLVM Constant to an MCExpr. + /// Lower the specified LLVM Constant to an MCExpr. /// The AsmPrinter::lowerConstantof does not know how to lower /// addrspacecast, therefore they should be lowered by this function. const MCExpr *lowerConstant(const Constant *CV) override; - /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo + /// tblgen'erated driver function for lowering simple MI->MC pseudo /// instructions. bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, const MachineInstr *MI); @@ -179,6 +121,8 @@ public: void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; + void EmitFunctionEntryLabel() override; void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 5a9138731934..18c7df0d94f2 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -20,6 +20,7 @@ #include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -32,13 +33,17 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, unsigned VReg) const { + // FIXME: Add support for non-void returns. 
+ if (Val) + return false; + MIRBuilder.buildInstr(AMDGPU::S_ENDPGM); return true; } unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, - unsigned Offset) const { + uint64_t Offset) const { MachineFunction &MF = MIRBuilder.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -61,7 +66,8 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, } void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, - Type *ParamTy, unsigned Offset, + Type *ParamTy, uint64_t Offset, + unsigned Align, unsigned DstReg) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); @@ -69,7 +75,6 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); unsigned TypeSize = DL.getTypeStoreSize(ParamTy); - unsigned Align = DL.getABITypeAlignment(ParamTy); unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); MachineMemOperand *MMO = @@ -84,12 +89,16 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const { + // AMDGPU_GS and AMDGP_HS are not supported yet. + if (F.getCallingConv() == CallingConv::AMDGPU_GS || + F.getCallingConv() == CallingConv::AMDGPU_HS) + return false; MachineFunction &MF = MIRBuilder.getMF(); - const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget()); + const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); SmallVector<CCValAssign, 16> ArgLocs; @@ -116,7 +125,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (Info->hasKernargSegmentPtr()) { unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); - const LLT P2 = LLT::pointer(2, 64); + const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); unsigned VReg = MRI.createGenericVirtualRegister(P2); MRI.addLiveIn(InputPtrReg, VReg); MIRBuilder.getMBB().addLiveIn(InputPtrReg); @@ -136,49 +145,106 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CCInfo.AllocateReg(FlatScratchInitReg); } + // The infrastructure for normal calling convention lowering is essentially + // useless for kernels. We want to avoid any kind of legalization or argument + // splitting. + if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) { + unsigned i = 0; + const unsigned KernArgBaseAlign = 16; + const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F); + uint64_t ExplicitArgOffset = 0; + + // TODO: Align down to dword alignment and extract bits for extending loads. 
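The loop that follows lays explicit kernel arguments out by ABI alignment on top of a kernarg base assumed to be 16-byte aligned, and the alignment recorded for each argument load is the minimum of that base alignment and the argument's offset. A self-contained sketch of that arithmetic with hypothetical argument sizes (alignTo and MinAlign are local stand-ins for LLVM's MathExtras helpers of the same name; BaseOffset, which the pass gets from getExplicitKernelArgOffset, is taken as 0 here for illustration):

  #include <cstdint>
  #include <cstdio>

  static uint64_t alignTo(uint64_t Value, uint64_t Align) {
    return (Value + Align - 1) / Align * Align;
  }

  // Largest power of two that divides both A and B.
  static uint64_t MinAlign(uint64_t A, uint64_t B) {
    return (A | B) & (1 + ~(A | B));
  }

  int main() {
    const uint64_t KernArgBaseAlign = 16;
    const uint64_t BaseOffset = 0;
    // Hypothetical arguments as {alloc size, ABI alignment} pairs: i32, i64, i16.
    const uint64_t Args[][2] = {{4, 4}, {8, 8}, {2, 2}};

    uint64_t ExplicitArgOffset = 0;
    for (const auto &A : Args) {
      uint64_t AllocSize = A[0], ABIAlign = A[1];
      uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
      uint64_t Align = MinAlign(KernArgBaseAlign, ArgOffset);
      std::printf("offset %llu, load align %llu\n",
                  (unsigned long long)ArgOffset, (unsigned long long)Align);
    }
    return 0; // prints offsets 0, 8, 16 with alignments 16, 8, 16
  }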
+ for (auto &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + unsigned AllocSize = DL.getTypeAllocSize(ArgTy); + if (AllocSize == 0) + continue; + + unsigned ABIAlign = DL.getABITypeAlignment(ArgTy); + + uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize; + + unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset); + ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy)); + lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]); + ++i; + } + + return true; + } + unsigned NumArgs = F.arg_size(); Function::const_arg_iterator CurOrigArg = F.arg_begin(); const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); + unsigned PSInputNum = 0; + BitVector Skipped(NumArgs); for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType()); // We can only hanlde simple value types at the moment. - if (!ValEVT.isSimple()) - return false; - MVT ValVT = ValEVT.getSimpleVT(); ISD::ArgFlagsTy Flags; ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()}; setArgFlags(OrigArg, i + 1, DL, F); Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); + + if (F.getCallingConv() == CallingConv::AMDGPU_PS && + !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() && + PSInputNum <= 15) { + if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) { + Skipped.set(i); + ++PSInputNum; + continue; + } + + Info->markPSInputAllocated(PSInputNum); + if (!CurOrigArg->use_empty()) + Info->markPSInputEnabled(PSInputNum); + + ++PSInputNum; + } + CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); - bool Res = - AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo); - // Fail if we don't know how to handle this type. - if (Res) - return false; + if (ValEVT.isVector()) { + EVT ElemVT = ValEVT.getVectorElementType(); + if (!ValEVT.isSimple()) + return false; + MVT ValVT = ElemVT.getSimpleVT(); + bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, + OrigArg.Flags, CCInfo); + if (!Res) + return false; + } else { + MVT ValVT = ValEVT.getSimpleVT(); + if (!ValEVT.isSimple()) + return false; + bool Res = + AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo); + + // Fail if we don't know how to handle this type. + if (Res) + return false; + } } Function::const_arg_iterator Arg = F.arg_begin(); - if (F.getCallingConv() == CallingConv::AMDGPU_VS) { - for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { - CCValAssign &VA = ArgLocs[i]; - MRI.addLiveIn(VA.getLocReg(), VRegs[i]); + if (F.getCallingConv() == CallingConv::AMDGPU_VS || + F.getCallingConv() == CallingConv::AMDGPU_PS) { + for (unsigned i = 0, OrigArgIdx = 0; + OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) { + if (Skipped.test(OrigArgIdx)) + continue; + CCValAssign &VA = ArgLocs[i++]; + MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx]); MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); - MIRBuilder.buildCopy(VRegs[i], VA.getLocReg()); + MIRBuilder.buildCopy(VRegs[OrigArgIdx], VA.getLocReg()); } return true; } - for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { - // FIXME: We should be getting DebugInfo from the arguments some how. 
- CCValAssign &VA = ArgLocs[i]; - lowerParameter(MIRBuilder, Arg->getType(), - VA.getLocMemOffset() + - Subtarget->getExplicitKernelArgOffset(MF), VRegs[i]); - } - - return true; + return false; } diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h index 251cb7a2c440..f51cb6abbf65 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -26,10 +26,11 @@ class AMDGPUCallLowering: public CallLowering { AMDGPUAS AMDGPUASI; unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, - unsigned Offset) const; + uint64_t Offset) const; void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, - unsigned Offset, unsigned DstReg) const; + uint64_t Offset, unsigned Align, + unsigned DstReg) const; public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index c1c066fd1404..68bc7fdd9961 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -85,22 +85,6 @@ def RetCC_SI_Shader : CallingConv<[ ]>> ]>; -// Calling convention for R600 -def CC_R600 : CallingConv<[ - CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[ - T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW, - T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW, - T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW, - T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW, - T30_XYZW, T31_XYZW, T32_XYZW - ]>>> -]>; - -// Calling convention for compute kernels -def CC_AMDGPU_Kernel : CallingConv<[ - CCCustom<"allocateKernArg"> -]>; - def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs< (sequence "VGPR%u", 24, 255) >; @@ -127,7 +111,7 @@ def CC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>, + CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, @@ -144,30 +128,16 @@ def RetCC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">> + CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">> ]>; def CC_AMDGPU : CallingConv<[ - CCIf<"static_cast<const AMDGPUSubtarget&>" - "(State.getMachineFunction().getSubtarget()).getGeneration() >=" - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "!AMDGPU::isShader(State.getCallingConv())", - CCDelegateTo<CC_AMDGPU_Kernel>>, - CCIf<"static_cast<const AMDGPUSubtarget&>" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "!AMDGPU::isShader(State.getCallingConv())", - CCDelegateTo<CC_AMDGPU_Kernel>>, - 
CCIf<"static_cast<const AMDGPUSubtarget&>" + CCIf<"static_cast<const GCNSubtarget&>" "(State.getMachineFunction().getSubtarget()).getGeneration() >= " "AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>, - CCIf<"static_cast<const AMDGPUSubtarget&>" + CCIf<"static_cast<const GCNSubtarget&>" "(State.getMachineFunction().getSubtarget()).getGeneration() >= " "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", - CCDelegateTo<CC_AMDGPU_Func>>, - CCIf<"static_cast<const AMDGPUSubtarget&>" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", - CCDelegateTo<CC_R600>> + CCDelegateTo<CC_AMDGPU_Func>> ]>; diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index b17b67167666..5713b7b7f9a8 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -17,8 +17,10 @@ #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" @@ -48,15 +50,22 @@ using namespace llvm; namespace { +static cl::opt<bool> WidenLoads( + "amdgpu-codegenprepare-widen-constant-loads", + cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(true)); + class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor<AMDGPUCodeGenPrepare, bool> { - const SISubtarget *ST = nullptr; + const GCNSubtarget *ST = nullptr; + AssumptionCache *AC = nullptr; DivergenceAnalysis *DA = nullptr; Module *Mod = nullptr; bool HasUnsafeFPMath = false; AMDGPUAS AMDGPUASI; - /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to + /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to /// binary operation \p V. /// /// \returns Binary operation \p V. @@ -80,7 +89,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// false otherwise. bool needsPromotionToI32(const Type *T) const; - /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary + /// Promotes uniform binary operation \p I to equivalent 32 bit binary /// operation. /// /// \details \p I's base element bit width must be greater than 1 and less @@ -93,7 +102,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// false otherwise. bool promoteUniformOpToI32(BinaryOperator &I) const; - /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. + /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. /// /// \details \p I's base element bit width must be greater than 1 and less /// than or equal 16. Promotion is done by sign or zero extending operands to @@ -102,7 +111,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// \returns True. bool promoteUniformOpToI32(ICmpInst &I) const; - /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select' + /// Promotes uniform 'select' operation \p I to 32 bit 'select' /// operation. /// /// \details \p I's base element bit width must be greater than 1 and less @@ -113,7 +122,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// \returns True. 
bool promoteUniformOpToI32(SelectInst &I) const; - /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' + /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' /// intrinsic. /// /// \details \p I's base element bit width must be greater than 1 and less @@ -125,7 +134,17 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// /// \returns True. bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; - /// \brief Widen a scalar load. + + /// Expands 24 bit div or rem. + Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, + Value *Num, Value *Den, + bool IsDiv, bool IsSigned) const; + + /// Expands 32 bit div or rem. + Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I, + Value *Num, Value *Den) const; + + /// Widen a scalar load. /// /// \details \p Widen scalar load for uniform, small type loads from constant // memory / to a full 32-bits and then truncate the input to allow a scalar @@ -157,6 +176,7 @@ public: StringRef getPassName() const override { return "AMDGPU IR optimizations"; } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DivergenceAnalysis>(); AU.setPreservesAll(); } @@ -250,7 +270,9 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { "I does not need promotion to i32"); if (I.getOpcode() == Instruction::SDiv || - I.getOpcode() == Instruction::UDiv) + I.getOpcode() == Instruction::UDiv || + I.getOpcode() == Instruction::SRem || + I.getOpcode() == Instruction::URem) return false; IRBuilder<> Builder(&I); @@ -372,13 +394,18 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32( return true; } -static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { +static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) { const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); if (!CNum) - return false; + return HasDenormals; + + if (UnsafeDiv) + return true; + + bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0); // Reciprocal f32 is handled separately without denormals. - return UnsafeDiv || CNum->isExactlyValue(+1.0); + return HasDenormals ^ IsOne; } // Insert an intrinsic for fast fdiv for safe math situations where we can @@ -404,7 +431,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { FMF.allowReciprocal(); // With UnsafeDiv node will be optimized to just rcp and mul. 
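For reference, the reworked shouldKeepFDivF32 above boils down to a small decision table over the numerator, the unsafe-math flag, and whether f32 denormals are enabled. A standalone restatement of just that predicate (the enum is an informal stand-in for the ConstantFP inspection in the real code):

  #include <cassert>

  enum class Numerator { NonConstant, PlusMinusOne, OtherConstant };

  // true = keep the plain fdiv; false = emit the fast-division intrinsic call.
  static bool shouldKeepFDivF32(Numerator Num, bool UnsafeDiv, bool HasDenormals) {
    if (Num == Numerator::NonConstant)
      return HasDenormals;                  // fast path only without denormals
    if (UnsafeDiv)
      return true;
    bool IsOne = (Num == Numerator::PlusMinusOne);
    // Per the comment in the pass, reciprocal f32 is handled separately
    // without denormals, so +/-1.0 numerators are kept in that case.
    return HasDenormals ^ IsOne;
  }

  int main() {
    assert( shouldKeepFDivF32(Numerator::PlusMinusOne, false, false));
    assert(!shouldKeepFDivF32(Numerator::PlusMinusOne, false, true));
    assert( shouldKeepFDivF32(Numerator::OtherConstant, false, true));
    assert(!shouldKeepFDivF32(Numerator::OtherConstant, false, false));
    return 0;
  }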
- if (ST->hasFP32Denormals() || UnsafeDiv) + if (UnsafeDiv) return false; IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); @@ -418,6 +445,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { Value *NewFDiv = nullptr; + bool HasDenormals = ST->hasFP32Denormals(); if (VectorType *VT = dyn_cast<VectorType>(Ty)) { NewFDiv = UndefValue::get(VT); @@ -428,7 +456,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { Value *DenEltI = Builder.CreateExtractElement(Den, I); Value *NewElt; - if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) { + if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) { NewElt = Builder.CreateFDiv(NumEltI, DenEltI); } else { NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); @@ -437,7 +465,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); } } else { - if (!shouldKeepFDivF32(Num, UnsafeDiv)) + if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals)) NewFDiv = Builder.CreateCall(Decl, { Num, Den }); } @@ -447,7 +475,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { FDiv.eraseFromParent(); } - return true; + return !!NewFDiv; } static bool hasUnsafeFPMath(const Function &F) { @@ -455,18 +483,324 @@ static bool hasUnsafeFPMath(const Function &F) { return Attr.getValueAsString() == "true"; } +static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder, + Value *LHS, Value *RHS) { + Type *I32Ty = Builder.getInt32Ty(); + Type *I64Ty = Builder.getInt64Ty(); + + Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty); + Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty); + Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64); + Value *Lo = Builder.CreateTrunc(MUL64, I32Ty); + Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32)); + Hi = Builder.CreateTrunc(Hi, I32Ty); + return std::make_pair(Lo, Hi); +} + +static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { + return getMul64(Builder, LHS, RHS).second; +} + +// The fractional part of a float is enough to accurately represent up to +// a 24-bit signed integer. +Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den, + bool IsDiv, bool IsSigned) const { + assert(Num->getType()->isIntegerTy(32)); + + const DataLayout &DL = Mod->getDataLayout(); + unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); + if (LHSSignBits < 9) + return nullptr; + + unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); + if (RHSSignBits < 9) + return nullptr; + + + unsigned SignBits = std::min(LHSSignBits, RHSSignBits); + unsigned DivBits = 32 - SignBits; + if (IsSigned) + ++DivBits; + + Type *Ty = Num->getType(); + Type *I32Ty = Builder.getInt32Ty(); + Type *F32Ty = Builder.getFloatTy(); + ConstantInt *One = Builder.getInt32(1); + Value *JQ = One; + + if (IsSigned) { + // char|short jq = ia ^ ib; + JQ = Builder.CreateXor(Num, Den); + + // jq = jq >> (bitsize - 2) + JQ = Builder.CreateAShr(JQ, Builder.getInt32(30)); + + // jq = jq | 0x1 + JQ = Builder.CreateOr(JQ, One); + } + + // int ia = (int)LHS; + Value *IA = Num; + + // int ib, (int)RHS; + Value *IB = Den; + + // float fa = (float)ia; + Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty) + : Builder.CreateUIToFP(IA, F32Ty); + + // float fb = (float)ib; + Value *FB = IsSigned ? 
Builder.CreateSIToFP(IB,F32Ty) + : Builder.CreateUIToFP(IB,F32Ty); + + Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB); + Value *FQM = Builder.CreateFMul(FA, RCP); + + // fq = trunc(fqm); + CallInst* FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM }); + FQ->copyFastMathFlags(Builder.getFastMathFlags()); + + // float fqneg = -fq; + Value *FQNeg = Builder.CreateFNeg(FQ); + + // float fr = mad(fqneg, fb, fa); + Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz, + { FQNeg, FB, FA }, FQ); + + // int iq = (int)fq; + Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty) + : Builder.CreateFPToUI(FQ, I32Ty); + + // fr = fabs(fr); + FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ); + + // fb = fabs(fb); + FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ); + + // int cv = fr >= fb; + Value *CV = Builder.CreateFCmpOGE(FR, FB); + + // jq = (cv ? jq : 0); + JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0)); + + // dst = iq + jq; + Value *Div = Builder.CreateAdd(IQ, JQ); + + Value *Res = Div; + if (!IsDiv) { + // Rem needs compensation, it's easier to recompute it + Value *Rem = Builder.CreateMul(Div, Den); + Res = Builder.CreateSub(Num, Rem); + } + + // Truncate to number of bits this divide really is. + if (IsSigned) { + Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits)); + Res = Builder.CreateSExt(Res, Ty); + } else { + ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1); + Res = Builder.CreateAnd(Res, TruncMask); + } + + return Res; +} + +Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den) const { + Instruction::BinaryOps Opc = I.getOpcode(); + assert(Opc == Instruction::URem || Opc == Instruction::UDiv || + Opc == Instruction::SRem || Opc == Instruction::SDiv); + + FastMathFlags FMF; + FMF.setFast(); + Builder.setFastMathFlags(FMF); + + if (isa<Constant>(Den)) + return nullptr; // Keep it for optimization + + bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv; + bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv; + + Type *Ty = Num->getType(); + Type *I32Ty = Builder.getInt32Ty(); + Type *F32Ty = Builder.getFloatTy(); + + if (Ty->getScalarSizeInBits() < 32) { + if (IsSigned) { + Num = Builder.CreateSExt(Num, I32Ty); + Den = Builder.CreateSExt(Den, I32Ty); + } else { + Num = Builder.CreateZExt(Num, I32Ty); + Den = Builder.CreateZExt(Den, I32Ty); + } + } + + if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) { + Res = Builder.CreateTrunc(Res, Ty); + return Res; + } + + ConstantInt *Zero = Builder.getInt32(0); + ConstantInt *One = Builder.getInt32(1); + ConstantInt *MinusOne = Builder.getInt32(~0); + + Value *Sign = nullptr; + if (IsSigned) { + ConstantInt *K31 = Builder.getInt32(31); + Value *LHSign = Builder.CreateAShr(Num, K31); + Value *RHSign = Builder.CreateAShr(Den, K31); + // Remainder sign is the same as LHS + Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign; + + Num = Builder.CreateAdd(Num, LHSign); + Den = Builder.CreateAdd(Den, RHSign); + + Num = Builder.CreateXor(Num, LHSign); + Den = Builder.CreateXor(Den, RHSign); + } + + // RCP = URECIP(Den) = 2^32 / Den + e + // e is rounding error. 
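Stepping back from the IR above: expandDivRem24 builds a truncated single-precision quotient from a reciprocal estimate, reconstructs that estimate's remainder with a fused multiply-add, and bumps the integer quotient by jq when the remainder reaches the divisor in magnitude. A self-contained model of the same sequence in plain C++, assuming both operands fit comfortably in 24 bits as the sign-bit checks above guarantee (std::fma and an ordinary float reciprocal stand in for the hardware rcp/fmad_ftz the pass emits):

  #include <cassert>
  #include <cmath>
  #include <cstdint>

  // Signed division via f32, mirroring the structure of expandDivRem24.
  static int32_t div24(int32_t ia, int32_t ib) {
    // jq is +/-1 with the sign of the true quotient; the pass computes it as
    // ((ia ^ ib) >> 30) | 1.
    int32_t jq = ((ia ^ ib) < 0) ? -1 : 1;
    float fa = (float)ia;
    float fb = (float)ib;
    float fq = std::trunc(fa * (1.0f / fb));   // truncated quotient estimate
    float fr = std::fma(-fq, fb, fa);          // remainder of the estimate
    int32_t iq = (int32_t)fq;
    if (std::fabs(fr) >= std::fabs(fb))        // estimate fell one short: fix up
      iq += jq;
    return iq;
  }

  int main() {
    const int32_t As[] = {100, -100, 8388607, -7777777};
    const int32_t Bs[] = {1, -3, 7, 4095};
    for (int32_t a : As)
      for (int32_t b : Bs)
        assert(div24(a, b) == a / b);          // matches C truncating division
    return 0;
  }

The 32-bit path that begins below follows the same idea at a larger scale: it scales a float reciprocal of the divisor by 2^32, refines it with a computed error term, and then applies two remainder-based corrections to the mulhu quotient.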
+ Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty); + Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32); + Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000)); + Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1); + Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty); + + // RCP_LO, RCP_HI = mul(RCP, Den) */ + Value *RCP_LO, *RCP_HI; + std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den); + + // NEG_RCP_LO = -RCP_LO + Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO); + + // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) + Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero); + Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO); + + // Calculate the rounding error from the URECIP instruction + // E = mulhu(ABS_RCP_LO, RCP) + Value *E = getMulHu(Builder, ABS_RCP_LO, RCP); + + // RCP_A_E = RCP + E + Value *RCP_A_E = Builder.CreateAdd(RCP, E); + + // RCP_S_E = RCP - E + Value *RCP_S_E = Builder.CreateSub(RCP, E); + + // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) + Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E); + + // Quotient = mulhu(Tmp0, Num) + Value *Quotient = getMulHu(Builder, Tmp0, Num); + + // Num_S_Remainder = Quotient * Den + Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den); + + // Remainder = Num - Num_S_Remainder + Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder); + + // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) + Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den); + Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero); + + // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) + Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder); + Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, + MinusOne, Zero); + + // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero + Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero); + Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero); + + Value *Res; + if (IsDiv) { + // Quotient_A_One = Quotient + 1 + Value *Quotient_A_One = Builder.CreateAdd(Quotient, One); + + // Quotient_S_One = Quotient - 1 + Value *Quotient_S_One = Builder.CreateSub(Quotient, One); + + // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) + Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One); + + // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) + Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One); + } else { + // Remainder_S_Den = Remainder - Den + Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den); + + // Remainder_A_Den = Remainder + Den + Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den); + + // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) + Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den); + + // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) + Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den); + } + + if (IsSigned) { + Res = Builder.CreateXor(Res, Sign); + Res = Builder.CreateSub(Res, Sign); + } + + Res = Builder.CreateTrunc(Res, Ty); + + return Res; +} + bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { + if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && + DA->isUniform(&I) && promoteUniformOpToI32(I)) + return true; + bool Changed = false; + Instruction::BinaryOps Opc = I.getOpcode(); + Type *Ty = I.getType(); + Value *NewDiv = nullptr; + if ((Opc == Instruction::URem || Opc == Instruction::UDiv || + Opc == Instruction::SRem || Opc == Instruction::SDiv) && + Ty->getScalarSizeInBits() <= 32) { + Value *Num = I.getOperand(0); + Value *Den = I.getOperand(1); + IRBuilder<> Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); - if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && - DA->isUniform(&I)) - Changed |= promoteUniformOpToI32(I); + if (VectorType *VT = dyn_cast<VectorType>(Ty)) { + NewDiv = UndefValue::get(VT); + + for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { + Value *NumEltN = Builder.CreateExtractElement(Num, N); + Value *DenEltN = Builder.CreateExtractElement(Den, N); + Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); + if (!NewElt) + NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N); + } + } else { + NewDiv = expandDivRem32(Builder, I, Num, Den); + } + + if (NewDiv) { + I.replaceAllUsesWith(NewDiv); + I.eraseFromParent(); + Changed = true; + } + } return Changed; } -bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { - if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && +bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { + if (!WidenLoads) + return false; + + if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && canWidenScalarExtLoad(I)) { IRBuilder<> Builder(&I); Builder.SetCurrentDebugLocation(I.getDebugLoc()); @@ -474,7 +808,28 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { Type *I32Ty = Builder.getInt32Ty(); Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace()); Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT); - Value *WidenLoad = Builder.CreateLoad(BitCast); + LoadInst *WidenLoad = Builder.CreateLoad(BitCast); + WidenLoad->copyMetadata(I); + + // If we have range metadata, we need to convert the type, and not make + // assumptions about the high bits. + if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) { + ConstantInt *Lower = + mdconst::extract<ConstantInt>(Range->getOperand(0)); + + if (Lower->getValue().isNullValue()) { + WidenLoad->setMetadata(LLVMContext::MD_range, nullptr); + } else { + Metadata *LowAndHigh[] = { + ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))), + // Don't make assumptions about the high bits. 
+ ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0)) + }; + + WidenLoad->setMetadata(LLVMContext::MD_range, + MDNode::get(Mod->getContext(), LowAndHigh)); + } + } int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); Type *IntNTy = Builder.getIntNTy(TySize); @@ -540,10 +895,12 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { if (!TPC) return false; - const TargetMachine &TM = TPC->getTM<TargetMachine>(); - ST = &TM.getSubtarget<SISubtarget>(F); + const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>(); + ST = &TM.getSubtarget<GCNSubtarget>(F); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DA = &getAnalysis<DivergenceAnalysis>(); HasUnsafeFPMath = hasUnsafeFPMath(F); + AMDGPUASI = TM.getAMDGPUAS(); bool MadeChange = false; @@ -560,6 +917,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td new file mode 100644 index 000000000000..b375cae9018e --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -0,0 +1,60 @@ +//===-- AMDGPUFeatures.td - AMDGPU Feature Definitions -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +def FeatureFP64 : SubtargetFeature<"fp64", + "FP64", + "true", + "Enable double precision operations" +>; + +def FeatureFMA : SubtargetFeature<"fmaf", + "FMA", + "true", + "Enable single precision FMA (not as fast as mul+add, but fused)" +>; + +class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast<string>(Value), + "The size of local memory in bytes" +>; + +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + +class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature< + "wavefrontsize"#Value, + "WavefrontSize", + !cast<string>(Value), + "The number of threads per wavefront" +>; + +def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; +def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; +def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; + +class SubtargetFeatureGeneration <string Value, string Subtarget, + list<SubtargetFeature> Implies> : + SubtargetFeature <Value, "Gen", Subtarget#"::"#Value, + Value#" GPU generation", Implies>; + +def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", + "DX10Clamp", + "true", + "clamp modifier clamps NaNs to 0.0" +>; + +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass" +>; + diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 91fe921bfeec..ee836bf8a631 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface to 
describe a layout of a stack frame on an AMDGPU target. +/// Interface to describe a layout of a stack frame on an AMDGPU target. // //===----------------------------------------------------------------------===// @@ -19,7 +19,7 @@ namespace llvm { -/// \brief Information about the stack frame layout on the AMDGPU targets. +/// Information about the stack frame layout on the AMDGPU targets. /// /// It holds the direction of the stack growth, the known stack alignment on /// entry to each function, and the offset to the locals area. diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td new file mode 100644 index 000000000000..ba735390f679 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUGISel.td @@ -0,0 +1,138 @@ +//===-- AMDGPUGIsel.td - AMDGPU GlobalISel Patterns---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This files contains patterns that should only be used by GlobalISel. For +// example patterns for V_* instructions that have S_* equivalents. +// SelectionDAG does not support selecting V_* instructions. +//===----------------------------------------------------------------------===// + +include "AMDGPU.td" + +def sd_vsrc0 : ComplexPattern<i32, 1, "">; +def gi_vsrc0 : + GIComplexOperandMatcher<s32, "selectVSRC0">, + GIComplexPatternEquiv<sd_vsrc0>; + +def sd_vcsrc : ComplexPattern<i32, 1, "">; +def gi_vcsrc : + GIComplexOperandMatcher<s32, "selectVCSRC">, + GIComplexPatternEquiv<sd_vcsrc>; + +def gi_vop3mods0 : + GIComplexOperandMatcher<s32, "selectVOP3Mods0">, + GIComplexPatternEquiv<VOP3Mods0>; + +def gi_vop3mods : + GIComplexOperandMatcher<s32, "selectVOP3Mods">, + GIComplexPatternEquiv<VOP3Mods>; + +def gi_vop3omods : + GIComplexOperandMatcher<s32, "selectVOP3OMods">, + GIComplexPatternEquiv<VOP3OMods>; + +class GISelSop2Pat < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src0_vt SReg_32:$src0), (src1_vt SReg_32:$src1))), + (inst src0_vt:$src0, src1_vt:$src1) +>; + +class GISelVop2Pat < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src0_vt (sd_vsrc0 src0_vt:$src0)), (src1_vt VGPR_32:$src1))), + (inst src0_vt:$src0, src1_vt:$src1) +>; + +class GISelVop2CommutePat < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src1_vt VGPR_32:$src1), (src0_vt (sd_vsrc0 src0_vt:$src0)))), + (inst src0_vt:$src0, src1_vt:$src1) +>; + +class GISelVop3Pat2 < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src0_vt (sd_vcsrc src0_vt:$src0)), (src1_vt (sd_vcsrc src1_vt:$src1)))), + (inst src0_vt:$src0, src1_vt:$src1) +>; + +class GISelVop3Pat2CommutePat < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src0_vt (sd_vcsrc src0_vt:$src0)), (src1_vt (sd_vcsrc src1_vt:$src1)))), + (inst src0_vt:$src1, src1_vt:$src0) +>; + +class GISelVop3Pat2ModsPat < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType 
src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src0_vt (VOP3Mods0 src0_vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omods)), + (src1_vt (VOP3Mods src1_vt:$src1, i32:$src1_modifiers)))), + (inst i32:$src0_modifiers, src0_vt:$src0, + i32:$src1_modifiers, src1_vt:$src1, $clamp, $omods) +>; + +multiclass GISelVop2IntrPat < + SDPatternOperator node, Instruction inst, + ValueType dst_vt, ValueType src_vt = dst_vt> { + + def : GISelVop2Pat <node, inst, dst_vt, src_vt>; + + // FIXME: Intrinsics aren't marked as commutable, so we need to add an explcit + // pattern to handle commuting. This is another reason why legalizing to a + // generic machine instruction may be better that matching the intrinsic + // directly. + def : GISelVop2CommutePat <node, inst, dst_vt, src_vt>; +} + +def : GISelSop2Pat <or, S_OR_B32, i32>; +def : GISelVop2Pat <or, V_OR_B32_e32, i32>; + +def : GISelSop2Pat <sra, S_ASHR_I32, i32>; +let AddedComplexity = 100 in { +let SubtargetPredicate = isSICI in { +def : GISelVop2Pat <sra, V_ASHR_I32_e32, i32>; +} +def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>; +} +def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>; + +// FIXME: Select directly to _e32 so we don't need to deal with modifiers. +// FIXME: We can't re-use SelectionDAG patterns here because they match +// against a custom SDNode and we would need to create a generic machine +// instruction that is equivalent to the custom SDNode. This would also require +// us to custom legalize the intrinsic to the new generic machine instruction, +// but I can't get custom legalizing of intrinsic to work and I'm not sure if +// this is even supported yet. +defm : GISelVop2IntrPat < + int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e32, v2f16, f32>; + +defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>; +def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>; +defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>; +def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>; diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index bf7deb500d1a..3a58c6c6a29f 100644 --- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -16,41 +16,89 @@ namespace AMDGPU { enum PartialMappingIdx { None = - 1, - PM_SGPR32 = 0, - PM_SGPR64 = 1, - PM_VGPR32 = 2, - PM_VGPR64 = 3 + PM_SGPR1 = 0, + PM_SGPR16 = 4, + PM_SGPR32 = 5, + PM_SGPR64 = 6, + PM_SGPR128 = 7, + PM_SGPR256 = 8, + PM_SGPR512 = 9, + PM_VGPR1 = 10, + PM_VGPR16 = 14, + PM_VGPR32 = 15, + PM_VGPR64 = 16, + PM_VGPR128 = 17, + PM_VGPR256 = 18, + PM_VGPR512 = 19, + PM_SGPR96 = 20, + PM_VGPR96 = 21 }; const RegisterBankInfo::PartialMapping PartMappings[] { // StartIdx, Length, RegBank + {0, 1, SCCRegBank}, + {0, 16, SGPRRegBank}, {0, 32, SGPRRegBank}, {0, 64, SGPRRegBank}, + {0, 128, SGPRRegBank}, + {0, 256, SGPRRegBank}, + {0, 512, SGPRRegBank}, + {0, 1, SGPRRegBank}, + {0, 16, VGPRRegBank}, {0, 32, VGPRRegBank}, - {0, 64, VGPRRegBank} + {0, 64, VGPRRegBank}, + {0, 128, VGPRRegBank}, + {0, 256, VGPRRegBank}, + {0, 512, VGPRRegBank}, + {0, 96, SGPRRegBank}, + {0, 96, VGPRRegBank}, }; const RegisterBankInfo::ValueMapping ValMappings[] { - // SGPR 32-bit {&PartMappings[0], 1}, - // SGPR 64-bit + {nullptr, 0}, + {nullptr, 0}, + {nullptr, 0}, {&PartMappings[1], 1}, - // VGPR 32-bit {&PartMappings[2], 1}, - // VGPR 64-bit - {&PartMappings[3], 1} + {&PartMappings[3], 1}, + {&PartMappings[4], 1}, + {&PartMappings[5], 1}, + {&PartMappings[6], 1}, + 
{&PartMappings[7], 1}, + {nullptr, 0}, + {nullptr, 0}, + {nullptr, 0}, + {&PartMappings[8], 1}, + {&PartMappings[9], 1}, + {&PartMappings[10], 1}, + {&PartMappings[11], 1}, + {&PartMappings[12], 1}, + {&PartMappings[13], 1}, + {&PartMappings[14], 1}, + {&PartMappings[15], 1} }; enum ValueMappingIdx { SGPRStartIdx = 0, - VGPRStartIdx = 2 + VGPRStartIdx = 10 }; const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, unsigned Size) { - assert(Size % 32 == 0); - unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx; - Idx += (Size / 32) - 1; + unsigned Idx; + switch (Size) { + case 1: + Idx = BankID == AMDGPU::SCCRegBankID ? PM_SGPR1 : PM_VGPR1; + break; + case 96: + Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR96 : PM_VGPR96; + break; + default: + Idx = BankID == AMDGPU::VGPRRegBankID ? VGPRStartIdx : SGPRStartIdx; + Idx += Log2_32_Ceil(Size); + break; + } return &ValMappings[Idx]; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 463e700f13b7..01ef346f74ee 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -8,13 +8,17 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU HSA Metadata Streamer. +/// AMDGPU HSA Metadata Streamer. /// // //===----------------------------------------------------------------------===// #include "AMDGPUHSAMetadataStreamer.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIMachineFunctionInfo.h" +#include "SIProgramInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" @@ -196,6 +200,57 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions( return Dims; } +Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + HSAMD::Kernel::CodeProps::Metadata HSACodeProps; + const Function &F = MF.getFunction(); + + assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || + F.getCallingConv() == CallingConv::SPIR_KERNEL); + + unsigned MaxKernArgAlign; + HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F, + MaxKernArgAlign); + HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; + HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; + HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u); + HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); + HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR; + HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR; + HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize(); + HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack; + HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled(); + HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs(); + HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs(); + + return HSACodeProps; +} + +Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + HSAMD::Kernel::DebugProps::Metadata HSADebugProps; + + if (!STM.debuggerSupported()) + return HSADebugProps; + + HSADebugProps.mDebuggerABIVersion.push_back(1); + 
HSADebugProps.mDebuggerABIVersion.push_back(0); + + if (STM.debuggerEmitPrologue()) { + HSADebugProps.mPrivateSegmentBufferSGPR = + ProgramInfo.DebuggerPrivateSegmentBufferSGPR; + HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR = + ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; + } + + return HSADebugProps; +} + void MetadataStreamer::emitVersion() { auto &Version = HSAMetadata.mVersion; @@ -255,32 +310,7 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) { for (auto &Arg : Func.args()) emitKernelArg(Arg); - // TODO: What about other languages? - if (!Func.getParent()->getNamedMetadata("opencl.ocl.version")) - return; - - auto &DL = Func.getParent()->getDataLayout(); - auto Int64Ty = Type::getInt64Ty(Func.getContext()); - - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX); - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); - - auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), - AMDGPUASI.GLOBAL_ADDRESS); - auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts"); - if (CallsPrintf) - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); - if (Func.hasFnAttribute("calls-enqueue-kernel")) { - if (!CallsPrintf) { - // Emit a dummy argument so that the remaining hidden arguments - // have a fixed position relative to the first hidden argument. - // This is to facilitate library code to access hidden arguments. - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); - } - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction); - } + emitHiddenKernelArgs(Func); } void MetadataStreamer::emitKernelArg(const Argument &Arg) { @@ -320,13 +350,26 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) { if (Node && ArgNo < Node->getNumOperands()) TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); - emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(), - getValueKind(Arg.getType(), TypeQual, BaseTypeName), Name, - TypeName, BaseTypeName, AccQual, TypeQual); + Type *Ty = Arg.getType(); + const DataLayout &DL = Func->getParent()->getDataLayout(); + + unsigned PointeeAlign = 0; + if (auto PtrTy = dyn_cast<PointerType>(Ty)) { + if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) { + PointeeAlign = Arg.getParamAlignment(); + if (PointeeAlign == 0) + PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType()); + } + } + + emitKernelArg(DL, Ty, getValueKind(Arg.getType(), TypeQual, BaseTypeName), + PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual); } void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, - ValueKind ValueKind, StringRef Name, + ValueKind ValueKind, + unsigned PointeeAlign, + StringRef Name, StringRef TypeName, StringRef BaseTypeName, StringRef AccQual, StringRef TypeQual) { HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata()); @@ -338,12 +381,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, Arg.mAlign = DL.getABITypeAlignment(Ty); Arg.mValueKind = ValueKind; Arg.mValueType = getValueType(Ty, BaseTypeName); - - if (auto PtrTy = dyn_cast<PointerType>(Ty)) { - auto ElTy = PtrTy->getElementType(); - if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized()) - Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy); - } + Arg.mPointeeAlign = PointeeAlign; if (auto PtrTy = dyn_cast<PointerType>(Ty)) Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace()); @@ -366,6 
+404,48 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, } } +void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) { + int HiddenArgNumBytes = + getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); + + if (!HiddenArgNumBytes) + return; + + auto &DL = Func.getParent()->getDataLayout(); + auto Int64Ty = Type::getInt64Ty(Func.getContext()); + + if (HiddenArgNumBytes >= 8) + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX); + if (HiddenArgNumBytes >= 16) + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); + if (HiddenArgNumBytes >= 24) + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); + + auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), + AMDGPUASI.GLOBAL_ADDRESS); + + // Emit "printf buffer" argument if printf is used, otherwise emit dummy + // "none" argument. + if (HiddenArgNumBytes >= 32) { + if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); + else + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + } + + // Emit "default queue" and "completion action" arguments if enqueue kernel is + // used, otherwise emit dummy "none" arguments. + if (HiddenArgNumBytes >= 48) { + if (Func.hasFnAttribute("calls-enqueue-kernel")) { + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue); + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction); + } else { + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + } + } +} + void MetadataStreamer::begin(const Module &Mod) { AMDGPUASI = getAMDGPUAS(Mod); emitVersion(); @@ -383,13 +463,14 @@ void MetadataStreamer::end() { verify(HSAMetadataString); } -void MetadataStreamer::emitKernel( - const Function &Func, - const Kernel::CodeProps::Metadata &CodeProps, - const Kernel::DebugProps::Metadata &DebugProps) { +void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { + auto &Func = MF.getFunction(); if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL) return; + auto CodeProps = getHSACodeProps(MF, ProgramInfo); + auto DebugProps = getHSADebugProps(MF, ProgramInfo); + HSAMetadata.mKernels.push_back(Kernel::Metadata()); auto &Kernel = HSAMetadata.mKernels.back(); diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index bd6515521a74..3424c956d781 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU HSA Metadata Streamer. +/// AMDGPU HSA Metadata Streamer. 
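Back in emitHiddenKernelArgs above, which hidden arguments get metadata is driven entirely by the amdgpu-implicitarg-num-bytes attribute plus whether the module uses printf or device enqueue. A small standalone sketch of that mapping (the strings are informal labels for the ValueKind entries; the two flags model the llvm.printf.fmts and calls-enqueue-kernel checks):

  #include <cstdio>
  #include <string>
  #include <vector>

  // Hidden kernel arguments emitted for a given implicit-arg byte budget,
  // mirroring the thresholds in emitHiddenKernelArgs.
  static std::vector<std::string> hiddenArgs(int NumBytes, bool UsesPrintf,
                                             bool EnqueuesKernels) {
    std::vector<std::string> Out;
    if (NumBytes >= 8)  Out.push_back("global_offset_x"); // i64
    if (NumBytes >= 16) Out.push_back("global_offset_y"); // i64
    if (NumBytes >= 24) Out.push_back("global_offset_z"); // i64
    if (NumBytes >= 32) // printf buffer, or a placeholder to keep positions fixed
      Out.push_back(UsesPrintf ? "printf_buffer" : "none");
    if (NumBytes >= 48) {
      if (EnqueuesKernels) {
        Out.push_back("default_queue");
        Out.push_back("completion_action");
      } else {
        Out.push_back("none");
        Out.push_back("none");
      }
    }
    return Out;
  }

  int main() {
    for (const auto &A : hiddenArgs(48, /*UsesPrintf=*/true, /*EnqueuesKernels=*/false))
      std::printf("%s\n", A.c_str());
    return 0;
  }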
/// // //===----------------------------------------------------------------------===// @@ -28,6 +28,7 @@ class DataLayout; class Function; class MDNode; class Module; +struct SIProgramInfo; class Type; namespace AMDGPU { @@ -55,6 +56,13 @@ private: std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const; + Kernel::CodeProps::Metadata getHSACodeProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; + Kernel::DebugProps::Metadata getHSADebugProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; + void emitVersion(); void emitPrintf(const Module &Mod); @@ -68,10 +76,13 @@ private: void emitKernelArg(const Argument &Arg); void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind, + unsigned PointeeAlign = 0, StringRef Name = "", StringRef TypeName = "", StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); + void emitHiddenKernelArgs(const Function &Func); + public: MetadataStreamer() = default; ~MetadataStreamer() = default; @@ -84,9 +95,7 @@ public: void end(); - void emitKernel(const Function &Func, - const Kernel::CodeProps::Metadata &CodeProps, - const Kernel::DebugProps::Metadata &DebugProps); + void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo); }; } // end namespace HSAMD diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index f4776adb069c..f25f4d4693ea 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -8,7 +8,7 @@ //==-----------------------------------------------------------------------===// // /// \file -/// \brief Defines an instruction selector for the AMDGPU target. +/// Defines an instruction selector for the AMDGPU target. // //===----------------------------------------------------------------------===// @@ -16,6 +16,7 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPUInstrInfo.h" +#include "AMDGPUPerfHintAnalysis.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" @@ -24,15 +25,16 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -43,6 +45,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include <cassert> #include <cstdint> @@ -68,7 +71,7 @@ namespace { class AMDGPUDAGToDAGISel : public SelectionDAGISel { // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can // make the right decision when generating code for different targets. 
- const AMDGPUSubtarget *Subtarget; + const GCNSubtarget *Subtarget; AMDGPUAS AMDGPUASI; bool EnableLateStructurizeCFG; @@ -83,6 +86,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AMDGPUArgumentUsageInfo>(); + AU.addRequired<AMDGPUPerfHintAnalysis>(); + AU.addRequired<DivergenceAnalysis>(); SelectionDAGISel::getAnalysisUsage(AU); } @@ -98,20 +103,12 @@ private: std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; bool isNoNanSrc(SDValue N) const; bool isInlineImmediate(const SDNode *N) const; - bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, - const R600InstrInfo *TII); - bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); - bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); - bool isConstantLoad(const MemSDNode *N, int cbID) const; bool isUniformBr(const SDNode *N) const; SDNode *glueCopyToM0(SDNode *N) const; const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; - bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); - bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, - SDValue& Offset); virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, @@ -162,6 +159,7 @@ private: bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; + SDValue Expand32BitAddress(SDValue Addr) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool &Imm) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; @@ -216,7 +214,7 @@ private: void SelectS_BFE(SDNode *N); bool isCBranchSCC(const SDNode *N) const; void SelectBRCOND(SDNode *N); - void SelectFMAD(SDNode *N); + void SelectFMAD_FMA(SDNode *N); void SelectATOMIC_CMP_SWAP(SDNode *N); protected: @@ -225,9 +223,18 @@ protected: }; class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { + const R600Subtarget *Subtarget; + AMDGPUAS AMDGPUASI; + + bool isConstantLoad(const MemSDNode *N, int cbID) const; + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, + SDValue& Offset); public: explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) : - AMDGPUDAGToDAGISel(TM, OptLevel) {} + AMDGPUDAGToDAGISel(TM, OptLevel) { + AMDGPUASI = AMDGPU::getAMDGPUAS(*TM); + } void Select(SDNode *N) override; @@ -235,6 +242,11 @@ public: SDValue &Offset) override; bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset) override; + + bool runOnMachineFunction(MachineFunction &MF) override; +protected: + // Include the pieces autogenerated from the target description. +#include "R600GenDAGISel.inc" }; } // end anonymous namespace @@ -242,17 +254,19 @@ public: INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) +INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) -/// \brief This pass converts a legalized DAG into a AMDGPU-specific +/// This pass converts a legalized DAG into a AMDGPU-specific // DAG, ready for instruction scheduling. 
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel) { return new AMDGPUDAGToDAGISel(TM, OptLevel); } -/// \brief This pass converts a legalized DAG into a R600-specific +/// This pass converts a legalized DAG into a R600-specific // DAG, ready for instruction scheduling. FunctionPass *llvm::createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel) { @@ -260,7 +274,7 @@ FunctionPass *llvm::createR600ISelDag(TargetMachine *TM, } bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &MF.getSubtarget<AMDGPUSubtarget>(); + Subtarget = &MF.getSubtarget<GCNSubtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -276,8 +290,7 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const { } bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { - const SIInstrInfo *TII - = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo(); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) return TII->isInlineConstant(C->getAPIntValue()); @@ -288,7 +301,7 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { return false; } -/// \brief Determine the register class for \p OpNo +/// Determine the register class for \p OpNo /// \returns The register class of the virtual register that will be used for /// the given operand number \OpNo or NULL if the register class cannot be /// determined. @@ -303,7 +316,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } const SIRegisterInfo *TRI - = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo(); + = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo(); return TRI->getPhysRegClass(Reg); } @@ -394,7 +407,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); SDLoc DL(N); SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); @@ -420,10 +432,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { IsRegSeq = false; break; } + unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i); RegSeqArgs[1 + (2 * i)] = N->getOperand(i); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, - MVT::i32); + RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32); } if (NOps != NumVectorElts) { // Fill in the missing undef elements if this was a scalar_to_vector. 
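The SelectBuildVector changes above (continued in the next hunk) assemble the operand list of a REG_SEQUENCE node: operand 0 is the register class, followed by one (value, sub-register index) pair per vector element, with undefined tail elements filled by IMPLICIT_DEF. A standalone sketch of that layout, not part of the patch; the register class name and the single-defined-element case are made-up example values:

#include <cstdio>
#include <string>
#include <vector>

int main() {
  const unsigned NumVectorElts = 4; // e.g. a v4i32 BUILD_VECTOR
  const unsigned NumDefinedOps = 1; // scalar_to_vector: only element 0 is real

  // Operand 0 is the register class; then one (value, sub-register) pair per
  // element. Undefined tail elements are filled with IMPLICIT_DEF.
  std::vector<std::string> RegSeqArgs(1 + NumVectorElts * 2);
  RegSeqArgs[0] = "SReg_128RegClassID"; // hypothetical class for this example
  for (unsigned i = 0; i < NumVectorElts; ++i) {
    RegSeqArgs[1 + 2 * i] =
        i < NumDefinedOps ? "op" + std::to_string(i) : "IMPLICIT_DEF";
    // getSubRegFromChannel(i) supplies the sub-register index for lane i.
    RegSeqArgs[1 + 2 * i + 1] = "sub" + std::to_string(i);
  }

  for (const std::string &Arg : RegSeqArgs)
    std::printf("%s\n", Arg.c_str());
  return 0;
}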
@@ -431,9 +442,10 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, EltVT); for (unsigned i = NOps; i < NumVectorElts; ++i) { + unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i); RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); + CurDAG->getTargetConstant(Sub, DL, MVT::i32); } } @@ -450,7 +462,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { } if (isa<AtomicSDNode>(N) || - (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC)) + (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || + Opc == AMDGPUISD::ATOMIC_LOAD_FADD || + Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || + Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) N = glueCopyToM0(N); switch (Opc) { @@ -487,9 +502,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::BUILD_VECTOR: { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); - - if (VT == MVT::v2i16 || VT == MVT::v2f16) { - if (Opc == ISD::BUILD_VECTOR) { + if (VT.getScalarSizeInBits() == 16) { + if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) { uint32_t LHSVal, RHSVal; if (getConstantValue(N->getOperand(0), LHSVal) && getConstantValue(N->getOperand(1), RHSVal)) { @@ -559,7 +573,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { return; } case ISD::LOAD: - case ISD::STORE: { + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: { N = glueCopyToM0(N); break; } @@ -619,7 +635,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectBRCOND(N); return; case ISD::FMAD: - SelectFMAD(N); + case ISD::FMA: + SelectFMAD_FMA(N); return; case AMDGPUISD::ATOMIC_CMP_SWAP: SelectATOMIC_CMP_SWAP(N); @@ -629,15 +646,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectCode(N); } -bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { - if (!N->readMem()) - return false; - if (CbId == -1) - return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS; - - return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; -} - bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); const Instruction *Term = BB->getTerminator(); @@ -653,26 +661,6 @@ StringRef AMDGPUDAGToDAGISel::getPassName() const { // Complex Patterns //===----------------------------------------------------------------------===// -bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, - SDValue& IntPtr) { - if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { - IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), - true); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, - SDValue& BaseReg, SDValue &Offset) { - if (!isa<ConstantSDNode>(Addr)) { - BaseReg = Addr; - Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); - return true; - } - return false; -} - bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset) { return false; @@ -684,11 +672,11 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, SDLoc DL(Addr); if ((C = dyn_cast<ConstantSDNode>(Addr))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && (C 
= dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { @@ -759,12 +747,11 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { if (ProduceCarry) { // Replace the carry-use - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1)); + ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1)); } // Replace the remaining uses. - CurDAG->ReplaceAllUsesWith(N, RegSequence); - CurDAG->RemoveDeadNode(N); + ReplaceNode(N, RegSequence); } void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { @@ -1410,7 +1397,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, return false; SDLoc SL(ByteOffsetNode); - AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration(); + GCNSubtarget::Generation Gen = Subtarget->getGeneration(); int64_t ByteOffset = C->getSExtValue(); int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset); @@ -1435,19 +1422,45 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, return true; } +SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { + if (Addr.getValueType() != MVT::i32) + return Addr; + + // Zero-extend a 32-bit address. + SDLoc SL(Addr); + + const MachineFunction &MF = CurDAG->getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned AddrHiVal = Info->get32BitAddressHighBits(); + SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32); + + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32), + Addr, + CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi), + 0), + CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32), + }; + + return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64, + Ops), 0); +} + bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool &Imm) const { SDLoc SL(Addr); + if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); if (SelectSMRDOffset(N1, Offset, Imm)) { - SBase = N0; + SBase = Expand32BitAddress(N0); return true; } } - SBase = Addr; + SBase = Expand32BitAddress(Addr); Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); Imm = true; return true; @@ -1651,7 +1664,7 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const { return true; if (VT == MVT::i64) { - auto ST = static_cast<const SISubtarget *>(Subtarget); + auto ST = static_cast<const GCNSubtarget *>(Subtarget); ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64(); @@ -1674,15 +1687,39 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC; SDLoc SL(N); + if (!UseSCCBr) { + // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not + // analyzed what generates the vcc value, so we do not know whether vcc + // bits for disabled lanes are 0. Thus we need to mask out bits for + // disabled lanes. 
+ // + // For the case that we select S_CBRANCH_SCC1 and it gets + // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls + // SIInstrInfo::moveToVALU which inserts the S_AND). + // + // We could add an analysis of what generates the vcc value here and omit + // the S_AND when is unnecessary. But it would be better to add a separate + // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it + // catches both cases. + Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1, + CurDAG->getRegister(AMDGPU::EXEC, MVT::i1), + Cond), + 0); + } + SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond); CurDAG->SelectNodeTo(N, BrOp, MVT::Other, N->getOperand(2), // Basic Block VCC.getValue(0)); } -void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) { MVT VT = N->getSimpleValueType(0); - if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) { + bool IsFMA = N->getOpcode() == ISD::FMA; + if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() && + !Subtarget->hasFmaMixInsts()) || + ((IsFMA && Subtarget->hasMadMixInsts()) || + (!IsFMA && Subtarget->hasFmaMixInsts()))) { SelectCode(N); return; } @@ -1692,13 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { SDValue Src2 = N->getOperand(2); unsigned Src0Mods, Src1Mods, Src2Mods; - // Avoid using v_mad_mix_f32 unless there is actually an operand using the - // conversion from f16. + // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand + // using the conversion from f16. bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods); bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods); bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods); - assert(!Subtarget->hasFP32Denormals() && + assert((IsFMA || !Subtarget->hasFP32Denormals()) && "fmad selected with denormals enabled"); // TODO: We can select this with f32 denormals enabled if all the sources are // converted from f16 (in which case fmad isn't legal). @@ -1714,7 +1751,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { Zero, Zero }; - CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops); + CurDAG->SelectNodeTo(N, + IsFMA ? 
AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32, + MVT::f32, Ops); } else { SelectCode(N); } @@ -2100,6 +2139,41 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() { } while (IsModified); } +bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &MF.getSubtarget<R600Subtarget>(); + return SelectionDAGISel::runOnMachineFunction(MF); +} + +bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { + if (!N->readMem()) + return false; + if (CbId == -1) + return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT; + + return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; +} + +bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), + true); + return true; + } + return false; +} + +bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!isa<ConstantSDNode>(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); + return true; + } + return false; +} + void R600DAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -2120,12 +2194,12 @@ void R600DAGToDAGISel::Select(SDNode *N) { // pass. We want to avoid 128 bits copies as much as possible because they // can't be bundled by our scheduler. switch(NumVectorElts) { - case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; + case 2: RegClassID = R600::R600_Reg64RegClassID; break; case 4: if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) - RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + RegClassID = R600::R600_Reg128VerticalRegClassID; else - RegClassID = AMDGPU::R600_Reg128RegClassID; + RegClassID = R600::R600_Reg128RegClassID; break; default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); } @@ -2143,11 +2217,11 @@ bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, SDLoc DL(Addr); if ((C = dyn_cast<ConstantSDNode>(Addr))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { @@ -2178,7 +2252,7 @@ bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, && isInt<16>(IMMOffset->getZExtValue())) { Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(CurDAG->getEntryNode()), - AMDGPU::ZERO, MVT::i32); + R600::ZERO, MVT::i32); Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), MVT::i32); return true; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 49929441ef21..b201126c593b 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This is the parent 
TargetLowering class for hardware code gen +/// This is the parent TargetLowering class for hardware code gen /// targets. // //===----------------------------------------------------------------------===// @@ -25,9 +25,12 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "Utils/AMDGPUBaseInfo.h" #include "R600MachineFunctionInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -38,18 +41,6 @@ #include "llvm/Support/KnownBits.h" using namespace llvm; -static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - MachineFunction &MF = State.getMachineFunction(); - AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); - - uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(), - ArgFlags.getOrigAlign()); - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - return true; -} - static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, @@ -71,7 +62,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, case MVT::i64: case MVT::f64: case MVT::v2i32: - case MVT::v2f32: { + case MVT::v2f32: + case MVT::v4i16: + case MVT::v4f16: { // Up to SGPR0-SGPR39 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, &AMDGPU::SGPR_64RegClass, 20); @@ -92,7 +85,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, case MVT::i64: case MVT::f64: case MVT::v2i32: - case MVT::v2f32: { + case MVT::v2f32: + case MVT::v4i16: + case MVT::v4f16: { return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, &AMDGPU::VReg_64RegClass, 31); } @@ -324,10 +319,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FLOG, MVT::f32, Custom); setOperationAction(ISD::FLOG10, MVT::f32, Custom); - if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::FLOG, MVT::f16, Custom); - setOperationAction(ISD::FLOG10, MVT::f16, Custom); - } setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); @@ -335,10 +326,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FREM, MVT::f32, Custom); setOperationAction(ISD::FREM, MVT::f64, Custom); - // v_mad_f32 does not support denormals according to some sources. - if (!Subtarget->hasFP32Denormals()) - setOperationAction(ISD::FMAD, MVT::f32, Legal); - // Expand to fneg + fadd. setOperationAction(ISD::FSUB, MVT::f64, Expand); @@ -353,19 +340,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - setOperationAction(ISD::FCEIL, MVT::f64, Custom); - setOperationAction(ISD::FTRUNC, MVT::f64, Custom); - setOperationAction(ISD::FRINT, MVT::f64, Custom); - setOperationAction(ISD::FFLOOR, MVT::f64, Custom); - } - - if (!Subtarget->hasBFI()) { - // fcopysign can be done in a single instruction with BFI. 
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - } - setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom); @@ -389,13 +363,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); - } - - if (!Subtarget->hasBCNT(32)) - setOperationAction(ISD::CTPOP, MVT::i32, Expand); - if (!Subtarget->hasBCNT(64)) - setOperationAction(ISD::CTPOP, MVT::i64, Expand); + // AMDGPU uses ADDC/SUBC/ADDE/SUBE + setOperationAction(ISD::ADDC, VT, Legal); + setOperationAction(ISD::SUBC, VT, Legal); + setOperationAction(ISD::ADDE, VT, Legal); + setOperationAction(ISD::SUBE, VT, Legal); + } // The hardware supports 32-bit ROTR, but not ROTL. setOperationAction(ISD::ROTL, MVT::i32, Expand); @@ -416,28 +390,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMAX, MVT::i32, Legal); setOperationAction(ISD::UMAX, MVT::i32, Legal); - if (Subtarget->hasFFBH()) - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); - - if (Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); - setOperationAction(ISD::CTTZ, MVT::i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom); setOperationAction(ISD::CTLZ, MVT::i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); - // We only really have 32-bit BFE instructions (and 16-bit on VI). - // - // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any - // effort to match them now. We want this to be false for i64 cases when the - // extraction isn't restricted to the upper or lower half. Ideally we would - // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that - // span the midpoint are probably relatively rare, so don't worry about them - // for now. - if (Subtarget->hasBFE()) - setHasExtractBitsInsn(true); - static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; @@ -468,10 +425,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Custom); setOperationAction(ISD::UDIVREM, VT, Expand); - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); @@ -546,11 +499,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // vector compares until that is fixed. setHasMultipleConditionRegisters(true); - // SI at least has hardware support for floating point exceptions, but no way - // of using or handling them is implemented. 
They are also optional in OpenCL - // (Section 7.3) - setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); - PredictableSelectIsExpensive = false; // We want to find all load dependencies for long chains of stores to enable @@ -573,6 +521,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::MULHU); setTargetDAGCombine(ISD::MULHS); @@ -607,6 +556,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case ISD::FNEARBYINT: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::SIN_HW: case AMDGPUISD::FMUL_LEGACY: case AMDGPUISD::FMIN_LEGACY: @@ -748,6 +698,37 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { return true; } +bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const { + switch (N->getOpcode()) { + default: + return false; + case ISD::EntryToken: + case ISD::TokenFactor: + return true; + case ISD::INTRINSIC_WO_CHAIN: + { + unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntrID) { + default: + return false; + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + return true; + } + } + break; + case ISD::LOAD: + { + const LoadSDNode * L = dyn_cast<LoadSDNode>(N); + if (L->getMemOperand()->getAddrSpace() + == AMDGPUASI.CONSTANT_ADDRESS_32BIT) + return true; + return false; + } + break; + } +} + //===---------------------------------------------------------------------===// // Target Properties //===---------------------------------------------------------------------===// @@ -832,17 +813,6 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return isZExtFree(Val.getValueType(), VT2); } -// v_mad_mix* support a conversion from f16 to f32. -// -// There is only one special case when denormals are enabled we don't currently, -// where this is OK to use. -bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode, - EVT DestVT, EVT SrcVT) const { - return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() && - DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && - SrcVT.getScalarType() == MVT::f16; -} - bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // There aren't really 64-bit registers, but pairs of 32-bit ones and only a // limited number of native 64-bit operations. Shrinking an operation to fit @@ -862,7 +832,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, switch (CC) { case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: - return CC_AMDGPU_Kernel; + llvm_unreachable("kernels should not be handled here"); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: @@ -885,7 +855,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, switch (CC) { case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: - return CC_AMDGPU_Kernel; + llvm_unreachable("kernels should not be handled here"); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: @@ -929,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, /// for each individual part is i8. We pass the memory type as LocVT to the /// calling convention analysis function and the register type (Ins[x].VT) as /// the ValVT. 
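The rewritten analyzeFormalArgumentsCompute in the next hunk derives each argument's in-memory offset with an align-then-advance pattern: align the running explicit-argument offset to the argument's ABI alignment, add the target's explicit base offset, then advance by the allocation size. A minimal standalone sketch of that arithmetic, not part of the patch; the argument list and the 36-byte base offset are made-up example values:

#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  struct Arg { const char *Name; uint64_t Size, Align; };
  // Hypothetical kernel signature: (char, int4, global float*).
  const Arg Args[] = {{"char", 1, 1}, {"int4", 16, 16}, {"float*", 8, 8}};

  const uint64_t ExplicitOffset = 36; // example base offset, target-dependent
  uint64_t ExplicitArgOffset = 0;

  for (const Arg &A : Args) {
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, A.Align) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, A.Align) + A.Size;
    std::printf("%-7s -> offset %llu\n", A.Name,
                (unsigned long long)ArgOffset);
  }
  return 0;
}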
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State, - const SmallVectorImpl<ISD::InputArg> &Ins) const { - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - const ISD::InputArg &In = Ins[i]; - EVT MemVT; - - unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT); - - if (!Subtarget->isAmdHsaOS() && - (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) { - // The ABI says the caller will extend these values to 32-bits. - MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32; - } else if (NumRegs == 1) { - // This argument is not split, so the IR type is the memory type. - assert(!In.Flags.isSplit()); - if (In.ArgVT.isExtended()) { - // We have an extended type, like i24, so we should just use the register type - MemVT = In.VT; - } else { - MemVT = In.ArgVT; - } - } else if (In.ArgVT.isVector() && In.VT.isVector() && - In.ArgVT.getScalarType() == In.VT.getScalarType()) { - assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements()); - // We have a vector value which has been split into a vector with - // the same scalar type, but fewer elements. This should handle - // all the floating-point vector types. - MemVT = In.VT; - } else if (In.ArgVT.isVector() && - In.ArgVT.getVectorNumElements() == NumRegs) { - // This arg has been split so that each element is stored in a separate - // register. - MemVT = In.ArgVT.getScalarType(); - } else if (In.ArgVT.isExtended()) { - // We have an extended type, like i65. - MemVT = In.VT; - } else { - unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs; - assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0); - if (In.VT.isInteger()) { - MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); - } else if (In.VT.isVector()) { - assert(!In.VT.getScalarType().isFloatingPoint()); - unsigned NumElements = In.VT.getVectorNumElements(); - assert(MemoryBits % NumElements == 0); - // This vector type has been split into another vector type with - // a different elements size. - EVT ScalarVT = EVT::getIntegerVT(State.getContext(), - MemoryBits / NumElements); - MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); +void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( + CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const { + const MachineFunction &MF = State.getMachineFunction(); + const Function &Fn = MF.getFunction(); + LLVMContext &Ctx = Fn.getParent()->getContext(); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); + const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn); + + unsigned MaxAlign = 1; + uint64_t ExplicitArgOffset = 0; + const DataLayout &DL = Fn.getParent()->getDataLayout(); + + unsigned InIndex = 0; + + for (const Argument &Arg : Fn.args()) { + Type *BaseArgTy = Arg.getType(); + unsigned Align = DL.getABITypeAlignment(BaseArgTy); + MaxAlign = std::max(Align, MaxAlign); + unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy); + + uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + + // We're basically throwing away everything passed into us and starting over + // to get accurate in-memory offsets. The "PartOffset" is completely useless + // to us as computed in Ins. + // + // We also need to figure out what type legalization is trying to do to get + // the correct memory offsets. 
+ + SmallVector<EVT, 16> ValueVTs; + SmallVector<uint64_t, 16> Offsets; + ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); + + for (unsigned Value = 0, NumValues = ValueVTs.size(); + Value != NumValues; ++Value) { + uint64_t BasePartOffset = Offsets[Value]; + + EVT ArgVT = ValueVTs[Value]; + EVT MemVT = ArgVT; + MVT RegisterVT = + getRegisterTypeForCallingConv(Ctx, ArgVT); + unsigned NumRegs = + getNumRegistersForCallingConv(Ctx, ArgVT); + + if (!Subtarget->isAmdHsaOS() && + (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) { + // The ABI says the caller will extend these values to 32-bits. + MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32; + } else if (NumRegs == 1) { + // This argument is not split, so the IR type is the memory type. + if (ArgVT.isExtended()) { + // We have an extended type, like i24, so we should just use the + // register type. + MemVT = RegisterVT; + } else { + MemVT = ArgVT; + } + } else if (ArgVT.isVector() && RegisterVT.isVector() && + ArgVT.getScalarType() == RegisterVT.getScalarType()) { + assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements()); + // We have a vector value which has been split into a vector with + // the same scalar type, but fewer elements. This should handle + // all the floating-point vector types. + MemVT = RegisterVT; + } else if (ArgVT.isVector() && + ArgVT.getVectorNumElements() == NumRegs) { + // This arg has been split so that each element is stored in a separate + // register. + MemVT = ArgVT.getScalarType(); + } else if (ArgVT.isExtended()) { + // We have an extended type, like i65. + MemVT = RegisterVT; } else { - llvm_unreachable("cannot deduce memory type."); + unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs; + assert(ArgVT.getStoreSizeInBits() % NumRegs == 0); + if (RegisterVT.isInteger()) { + MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); + } else if (RegisterVT.isVector()) { + assert(!RegisterVT.getScalarType().isFloatingPoint()); + unsigned NumElements = RegisterVT.getVectorNumElements(); + assert(MemoryBits % NumElements == 0); + // This vector type has been split into another vector type with + // a different elements size. + EVT ScalarVT = EVT::getIntegerVT(State.getContext(), + MemoryBits / NumElements); + MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); + } else { + llvm_unreachable("cannot deduce memory type."); + } } - } - // Convert one element vectors to scalar. - if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) - MemVT = MemVT.getScalarType(); + // Convert one element vectors to scalar. 
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) + MemVT = MemVT.getScalarType(); - if (MemVT.isExtended()) { - // This should really only happen if we have vec3 arguments - assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); - MemVT = MemVT.getPow2VectorType(State.getContext()); - } + if (MemVT.isExtended()) { + // This should really only happen if we have vec3 arguments + assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); + MemVT = MemVT.getPow2VectorType(State.getContext()); + } - assert(MemVT.isSimple()); - allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags, - State); + unsigned PartOffset = 0; + for (unsigned i = 0; i != NumRegs; ++i) { + State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT, + BasePartOffset + PartOffset, + MemVT.getSimpleVT(), + CCValAssign::Full)); + PartOffset += MemVT.getStoreSize(); + } + } } } @@ -1178,7 +1192,15 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = G->getGlobal(); - if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) { + if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS || + G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) { + if (!MFI->isEntryFunction()) { + const Function &Fn = DAG.getMachineFunction().getFunction(); + DiagnosticInfoUnsupported BadLDSDecl( + Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc()); + DAG.getContext()->diagnose(BadLDSDecl); + } + // XXX: What does the value of G->getOffset() mean? assert(G->getOffset() == 0 && "Do not know what to do with an non-zero offset"); @@ -1201,6 +1223,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> Args; + EVT VT = Op.getValueType(); + if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SDLoc SL(Op); + SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0)); + SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1)); + + SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi }); + return DAG.getNode(ISD::BITCAST, SL, VT, BV); + } + for (const SDUse &U : Op->ops()) DAG.ExtractVectorElements(U.get(), Args); @@ -1219,7 +1251,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); } -/// \brief Generate Min/Max node +/// Generate Min/Max node SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, @@ -1985,7 +2017,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); - // Extend back to to 64-bits. + // Extend back to 64-bits. 
SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit}); SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); @@ -2806,28 +2838,6 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, SN->getBasePtr(), SN->getMemOperand()); } -SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); - if (!CSrc) - return SDValue(); - - const APFloat &F = CSrc->getValueAPF(); - APFloat Zero = APFloat::getZero(F.getSemantics()); - APFloat::cmpResult Cmp0 = F.compare(Zero); - if (Cmp0 == APFloat::cmpLessThan || - (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { - return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); - } - - APFloat One(F.getSemantics(), "1.0"); - APFloat::cmpResult Cmp1 = F.compare(One); - if (Cmp1 == APFloat::cmpGreaterThan) - return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); - - return SDValue(CSrc, 0); -} - // FIXME: This should go in generic DAG combiner with an isTruncateFree check, // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU // issues. @@ -2903,7 +2913,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDValue X = LHS->getOperand(0); if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 && - isTypeLegal(MVT::v2i16)) { + isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) { // Prefer build_vector as the canonical form if packed types are legal. // (shl ([asz]ext i16:x), 16 -> build_vector 0, x SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, @@ -3017,6 +3027,92 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); } +SDValue AMDGPUTargetLowering::performTruncateCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(0); + + // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x) + if (Src.getOpcode() == ISD::BITCAST) { + SDValue Vec = Src.getOperand(0); + if (Vec.getOpcode() == ISD::BUILD_VECTOR) { + SDValue Elt0 = Vec.getOperand(0); + EVT EltVT = Elt0.getValueType(); + if (VT.getSizeInBits() <= EltVT.getSizeInBits()) { + if (EltVT.isFloatingPoint()) { + Elt0 = DAG.getNode(ISD::BITCAST, SL, + EltVT.changeTypeToInteger(), Elt0); + } + + return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0); + } + } + } + + // Equivalent of above for accessing the high element of a vector as an + // integer operation. + // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y) + if (Src.getOpcode() == ISD::SRL && !VT.isVector()) { + if (auto K = isConstOrConstSplat(Src.getOperand(1))) { + if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) { + SDValue BV = stripBitcast(Src.getOperand(0)); + if (BV.getOpcode() == ISD::BUILD_VECTOR && + BV.getValueType().getVectorNumElements() == 2) { + SDValue SrcElt = BV.getOperand(1); + EVT SrcEltVT = SrcElt.getValueType(); + if (SrcEltVT.isFloatingPoint()) { + SrcElt = DAG.getNode(ISD::BITCAST, SL, + SrcEltVT.changeTypeToInteger(), SrcElt); + } + + return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); + } + } + } + } + + // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit. 
+ // + // i16 (trunc (srl i64:x, K)), K <= 16 -> + // i16 (trunc (srl (i32 (trunc x), K))) + if (VT.getScalarSizeInBits() < 32) { + EVT SrcVT = Src.getValueType(); + if (SrcVT.getScalarSizeInBits() > 32 && + (Src.getOpcode() == ISD::SRL || + Src.getOpcode() == ISD::SRA || + Src.getOpcode() == ISD::SHL)) { + SDValue Amt = Src.getOperand(1); + KnownBits Known; + DAG.computeKnownBits(Amt, Known); + unsigned Size = VT.getScalarSizeInBits(); + if ((Known.isConstant() && Known.getConstant().ule(Size)) || + (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) { + EVT MidVT = VT.isVector() ? + EVT::getVectorVT(*DAG.getContext(), MVT::i32, + VT.getVectorNumElements()) : MVT::i32; + + EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout()); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT, + Src.getOperand(0)); + DCI.AddToWorklist(Trunc.getNode()); + + if (Amt.getValueType() != NewShiftVT) { + Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT); + DCI.AddToWorklist(Amt.getNode()); + } + + SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT, + Trunc, Amt); + return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift); + } + } + } + + return SDValue(); +} + // We need to specifically handle i64 mul here to avoid unnecessary conversion // instructions. If we only match on the legalized i64 mul expansion, // SimplifyDemandedBits will be unable to remove them because there will be @@ -3058,6 +3154,17 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + + // SimplifyDemandedBits has the annoying habit of turning useful zero_extends + // in the source into any_extends if the result of the mul is truncated. Since + // we can assume the high bits are whatever we want, use the underlying value + // to avoid the unknown high bits from interfering. + if (N0.getOpcode() == ISD::ANY_EXTEND) + N0 = N0.getOperand(0); + + if (N1.getOpcode() == ISD::ANY_EXTEND) + N1 = N1.getOperand(0); + SDValue Mul; if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { @@ -3495,6 +3602,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, case ISD::FSIN: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::SIN_HW: { SDValue CvtSrc = N0.getOperand(0); if (CvtSrc.getOpcode() == ISD::FNEG) { @@ -3571,6 +3679,18 @@ SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, } } +SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CFP) + return SDValue(); + + // XXX - Should this flush denormals? 
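// Illustration, not a line from the patch: for a source constant such as 2.0
// the fold below returns the constant 0.5 (1.0 / 2.0 evaluated at compile
// time in the type's float semantics), leaving the denormal question above open.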
+ const APFloat &Val = CFP->getValueAPF(); + APFloat One(Val.getSemantics(), "1.0"); + return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -3617,12 +3737,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, // TODO: Generalize and move to DAGCombiner SDValue Src = N->getOperand(0); if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) { - assert(Src.getValueType() == MVT::i64); - SDLoc SL(N); - uint64_t CVal = C->getZExtValue(); - return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT, - DAG.getConstant(Lo_32(CVal), SL, MVT::i32), - DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + if (Src.getValueType() == MVT::i64) { + SDLoc SL(N); + uint64_t CVal = C->getZExtValue(); + return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT, + DAG.getConstant(Lo_32(CVal), SL, MVT::i32), + DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + } } if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) { @@ -3656,6 +3777,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performSraCombine(N, DCI); } + case ISD::TRUNCATE: + return performTruncateCombine(N, DCI); case ISD::MUL: return performMulCombine(N, DCI); case ISD::MULHS: @@ -3768,18 +3891,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performLoadCombine(N, DCI); case ISD::STORE: return performStoreCombine(N, DCI); - case AMDGPUISD::CLAMP: - return performClampCombine(N, DCI); - case AMDGPUISD::RCP: { - if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) { - // XXX - Should this flush denormals? - const APFloat &Val = CFP->getValueAPF(); - APFloat One(Val.getSemantics(), "1.0"); - return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); - } - - break; - } + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_IFLAG: + return performRcpCombine(N, DCI); case ISD::AssertZext: case ISD::AssertSext: return performAssertSZExtCombine(N, DCI); @@ -3856,9 +3970,14 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, } uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( - const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const { - unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr(); - uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment); + const MachineFunction &MF, const ImplicitParameter Param) const { + const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); + const AMDGPUSubtarget &ST = + AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction()); + unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction()); + unsigned Alignment = ST.getAlignmentForImplicitArgPtr(); + uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) + + ExplicitArgOffset; switch (Param) { case GRID_DIM: return ArgOffset; @@ -3907,6 +4026,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMED3) NODE_NAME_CASE(SMED3) NODE_NAME_CASE(UMED3) + NODE_NAME_CASE(FDOT2) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) @@ -3917,6 +4037,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RSQ) NODE_NAME_CASE(RCP_LEGACY) NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(RCP_IFLAG) NODE_NAME_CASE(FMUL_LEGACY) NODE_NAME_CASE(RSQ_CLAMP) NODE_NAME_CASE(LDEXP) @@ -3941,6 +4062,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MAD_I24) 
NODE_NAME_CASE(MAD_I64_I32) NODE_NAME_CASE(MAD_U64_U32) + NODE_NAME_CASE(PERM) NODE_NAME_CASE(TEXTURE_FETCH) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(EXPORT_DONE) @@ -3957,6 +4079,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) NODE_NAME_CASE(CVT_PKRTZ_F16_F32) + NODE_NAME_CASE(CVT_PKNORM_I16_F32) + NODE_NAME_CASE(CVT_PKNORM_U16_F32) + NODE_NAME_CASE(CVT_PK_I16_I32) + NODE_NAME_CASE(CVT_PK_U16_U32) NODE_NAME_CASE(FP_TO_FP16) NODE_NAME_CASE(FP16_ZEXT) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) @@ -3976,14 +4102,21 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) + NODE_NAME_CASE(ATOMIC_LOAD_FADD) + NODE_NAME_CASE(ATOMIC_LOAD_FMIN) + NODE_NAME_CASE(ATOMIC_LOAD_FMAX) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) + NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(BUFFER_STORE) NODE_NAME_CASE(BUFFER_STORE_FORMAT) + NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) NODE_NAME_CASE(BUFFER_ATOMIC_ADD) NODE_NAME_CASE(BUFFER_ATOMIC_SUB) @@ -3995,6 +4128,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_OR) NODE_NAME_CASE(BUFFER_ATOMIC_XOR) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) + case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; @@ -4108,14 +4242,45 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.Zero.setHighBits(32 - MaxValBits); break; } + case AMDGPUISD::PERM: { + ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!CMask) + return; + + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1); + DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1); + unsigned Sel = CMask->getZExtValue(); + + for (unsigned I = 0; I < 32; I += 8) { + unsigned SelBits = Sel & 0xff; + if (SelBits < 4) { + SelBits *= 8; + Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; + Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; + } else if (SelBits < 7) { + SelBits = (SelBits & 3) * 8; + Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; + Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; + } else if (SelBits == 0x0c) { + Known.Zero |= 0xff << I; + } else if (SelBits > 0x0c) { + Known.One |= 0xff << I; + } + Sel >>= 8; + } + break; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); switch (IID) { case Intrinsic::amdgcn_mbcnt_lo: case Intrinsic::amdgcn_mbcnt_hi: { + const GCNSubtarget &ST = + DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); // These return at most the wavefront size - 1. 
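// Illustration, not a line from the patch: with a 64-lane wavefront the count
// is at most 63 and fits in 6 bits, so for a 32-bit result the high
// 32 - 6 = 26 bits are known zero.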
unsigned Size = Op.getValueType().getSizeInBits(); - Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2()); + Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2()); break; } default: diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 5c31bddd9b1a..a4c3b413e103 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface definition of the TargetLowering class that is common +/// Interface definition of the TargetLowering class that is common /// to all AMD GPUs. // //===----------------------------------------------------------------------===// @@ -28,6 +28,8 @@ struct ArgDescriptor; class AMDGPUTargetLowering : public TargetLowering { private: + const AMDGPUSubtarget *Subtarget; + /// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been /// legalized from a smaller type VT. Need to match pre-legalized type because /// the generic legalization inserts the add/sub between the select and @@ -39,12 +41,11 @@ public: static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG); protected: - const AMDGPUSubtarget *Subtarget; AMDGPUAS AMDGPUASI; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into multiple scalar stores. + /// Split a vector store into multiple scalar stores. /// \returns The resulting chain. SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; @@ -78,7 +79,6 @@ protected: bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, @@ -87,6 +87,7 @@ protected: SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -96,6 +97,7 @@ protected: SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); @@ -108,10 +110,10 @@ protected: SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const; SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector load into 2 loads of half the vector. + /// Split a vector load into 2 loads of half the vector. SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into 2 stores of half the vector. + /// Split a vector store into 2 stores of half the vector. 
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; @@ -120,8 +122,11 @@ protected: SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) const; - void analyzeFormalArgumentsCompute(CCState &State, - const SmallVectorImpl<ISD::InputArg> &Ins) const; + + void analyzeFormalArgumentsCompute( + CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const; + public: AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); @@ -136,6 +141,10 @@ public: return false; } + static inline SDValue stripBitcast(SDValue Val) { + return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; + } + static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4); bool isFAbsFree(EVT VT) const override; @@ -146,7 +155,6 @@ public: bool isZExtFree(Type *Src, Type *Dest) const override; bool isZExtFree(EVT Src, EVT Dest) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; - bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override; bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; @@ -168,6 +176,7 @@ public: bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; + bool isSDNodeAlwaysUniform(const SDNode *N) const override; static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); @@ -224,7 +233,7 @@ public: virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const = 0; - /// \brief Determine which of the bits specified in \p Mask are known to be + /// Determine which of the bits specified in \p Mask are known to be /// either zero or one and return them in the \p KnownZero and \p KnownOne /// bitsets. void computeKnownBitsForTargetNode(const SDValue Op, @@ -237,7 +246,7 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; - /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. /// /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise @@ -285,9 +294,9 @@ public: GRID_OFFSET, }; - /// \brief Helper function that returns the byte offset of the given + /// Helper function that returns the byte offset of the given /// type of implicit parameter. - uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, + uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const; AMDGPUAS getAMDGPUAS() const { @@ -357,6 +366,7 @@ enum NodeType : unsigned { FMED3, SMED3, UMED3, + FDOT2, URECIP, DIV_SCALE, DIV_FMAS, @@ -372,6 +382,7 @@ enum NodeType : unsigned { RSQ, RCP_LEGACY, RSQ_LEGACY, + RCP_IFLAG, FMUL_LEGACY, RSQ_CLAMP, LDEXP, @@ -396,6 +407,7 @@ enum NodeType : unsigned { MAD_I64_I32, MUL_LOHI_I24, MUL_LOHI_U24, + PERM, TEXTURE_FETCH, EXPORT, // exp on SI+ EXPORT_DONE, // exp on SI+ with done bit set @@ -417,6 +429,10 @@ enum NodeType : unsigned { // Convert two float 32 numbers into a single register holding two packed f16 // with round to zero. CVT_PKRTZ_F16_F32, + CVT_PKNORM_I16_F32, + CVT_PKNORM_U16_F32, + CVT_PK_I16_I32, + CVT_PK_U16_U32, // Same as the standard node, except the high bits of the resulting integer // are known 0. 
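The CVT_PKNORM_* and CVT_PK_* nodes added above model the VOP instructions that convert a pair of 32-bit inputs into a single packed 32-bit result. As a rough scalar sketch of what CVT_PKNORM_I16_F32 computes (the helper name, the use of std::lround, and the omitted NaN handling are illustrative assumptions, not the backend's implementation):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Clamp each f32 to [-1, 1], scale to the signed 16-bit range, and pack
    // the two results into one 32-bit value (first operand in the low half).
    static uint32_t cvtPkNormI16(float Lo, float Hi) {
      auto Cvt = [](float F) -> uint16_t {
        F = std::min(1.0f, std::max(-1.0f, F));   // clamp to [-1, 1]
        long V = std::lround(F * 32767.0f);       // rounding mode simplified
        return static_cast<uint16_t>(static_cast<int16_t>(V));
      };
      return Cvt(Lo) | (static_cast<uint32_t>(Cvt(Hi)) << 16);
    }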
@@ -451,14 +467,21 @@ enum NodeType : unsigned { LOAD_CONSTANT, TBUFFER_STORE_FORMAT, TBUFFER_STORE_FORMAT_X3, + TBUFFER_STORE_FORMAT_D16, TBUFFER_LOAD_FORMAT, + TBUFFER_LOAD_FORMAT_D16, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, + ATOMIC_LOAD_FADD, + ATOMIC_LOAD_FMIN, + ATOMIC_LOAD_FMAX, BUFFER_LOAD, BUFFER_LOAD_FORMAT, + BUFFER_LOAD_FORMAT_D16, BUFFER_STORE, BUFFER_STORE_FORMAT, + BUFFER_STORE_FORMAT_D16, BUFFER_ATOMIC_SWAP, BUFFER_ATOMIC_ADD, BUFFER_ATOMIC_SUB, @@ -470,6 +493,7 @@ enum NodeType : unsigned { BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, BUFFER_ATOMIC_CMPSWAP, + LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp index ff9e7b50ed5c..35dd9eb0a478 100644 --- a/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This is AMDGPU specific replacement of the standard inliner. +/// This is AMDGPU specific replacement of the standard inliner. /// The main purpose is to account for the fact that calls not only expensive /// on the AMDGPU, but much more expensive if a private memory pointer is /// passed to a function as an argument. In this situation, we are unable to @@ -161,8 +161,8 @@ static bool isWrapperOnlyCall(CallSite CS) { return false; } if (isa<ReturnInst>(*std::next(I->getIterator()))) { - DEBUG(dbgs() << " Wrapper only call detected: " - << Callee->getName() << '\n'); + LLVM_DEBUG(dbgs() << " Wrapper only call detected: " + << Callee->getName() << '\n'); return true; } } diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 8156599528c2..07aa7c2cc8ad 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -16,95 +16,36 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" using namespace llvm; -#define GET_INSTRINFO_CTOR_DTOR -#include "AMDGPUGenInstrInfo.inc" - // Pin the vtable to this file. -void AMDGPUInstrInfo::anchor() {} - -AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST) - : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), - ST(ST), - AMDGPUASI(ST.getAMDGPUAS()) {} - -// FIXME: This behaves strangely. If, for example, you have 32 load + stores, -// the first 16 loads will be interleaved with the stores, and the next 16 will -// be clustered as expected. It should really split into 2 16 store batches. -// -// Loads are clustered until this returns false, rather than trying to schedule -// groups of stores. This also means we have to deal with saying different -// address space loads should be clustered, and ones which might cause bank -// conflicts. -// -// This might be deprecated so it might not be worth that much effort to fix. -bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, - int64_t Offset0, int64_t Offset1, - unsigned NumLoads) const { - assert(Offset1 > Offset0 && - "Second offset should be larger than first offset!"); - // If we have less than 16 loads in a row, and the offsets are within 64 - // bytes, then schedule together. - - // A cacheline is 64 bytes (for global memory). 
- return (NumLoads <= 16 && (Offset1 - Offset0) < 64); -} - -// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td -enum SIEncodingFamily { - SI = 0, - VI = 1, - SDWA = 2, - SDWA9 = 3, - GFX9 = 4 -}; - -static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { - switch (ST.getGeneration()) { - case AMDGPUSubtarget::SOUTHERN_ISLANDS: - case AMDGPUSubtarget::SEA_ISLANDS: - return SIEncodingFamily::SI; - case AMDGPUSubtarget::VOLCANIC_ISLANDS: - case AMDGPUSubtarget::GFX9: - return SIEncodingFamily::VI; - - // FIXME: This should never be called for r600 GPUs. - case AMDGPUSubtarget::R600: - case AMDGPUSubtarget::R700: - case AMDGPUSubtarget::EVERGREEN: - case AMDGPUSubtarget::NORTHERN_ISLANDS: - return SIEncodingFamily::SI; - } - - llvm_unreachable("Unknown subtarget generation!"); -} - -int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - SIEncodingFamily Gen = subtargetEncodingFamily(ST); +//void AMDGPUInstrInfo::anchor() {} - if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && - ST.getGeneration() >= AMDGPUSubtarget::GFX9) - Gen = SIEncodingFamily::GFX9; +AMDGPUInstrInfo::AMDGPUInstrInfo(const GCNSubtarget &ST) { } - if (get(Opcode).TSFlags & SIInstrFlags::SDWA) - Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 - : SIEncodingFamily::SDWA; - int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); +// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. +bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) { + const Value *Ptr = MMO->getValue(); + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. + if (!Ptr || isa<UndefValue>(Ptr) || + isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) + return true; - // -1 means that Opcode is already a native instruction. - if (MCOp == -1) - return Opcode; + if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) + return true; - // (uint16_t)-1 means that Opcode is a pseudo instruction that has - // no encoding in the given subtarget generation. - if (MCOp == (uint16_t)-1) - return -1; + if (const Argument *Arg = dyn_cast<Argument>(Ptr)) + return AMDGPU::isArgPassedInSGPR(Arg); - return MCOp; + const Instruction *I = dyn_cast<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); } diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index a9fcd4834638..2f8166da0d33 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Contains the definition of a TargetInstrInfo class that is common +/// Contains the definition of a TargetInstrInfo class that is common /// to all AMD GPUs. 
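/// The class itself is now only a small collection of shared static helpers;
/// the TableGen-generated TargetInstrInfo implementations live in SIInstrInfo
/// and R600InstrInfo.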
// //===----------------------------------------------------------------------===// @@ -20,37 +20,43 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#define GET_INSTRINFO_HEADER -#include "AMDGPUGenInstrInfo.inc" -#undef GET_INSTRINFO_HEADER - namespace llvm { -class AMDGPUSubtarget; +class GCNSubtarget; class MachineFunction; class MachineInstr; class MachineInstrBuilder; -class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { -private: - const AMDGPUSubtarget &ST; +class AMDGPUInstrInfo { +public: + explicit AMDGPUInstrInfo(const GCNSubtarget &st); - virtual void anchor(); -protected: - AMDGPUAS AMDGPUASI; + static bool isUniformMMO(const MachineMemOperand *MMO); +}; -public: - explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); +namespace AMDGPU { - bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, - int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const override; +struct RsrcIntrinsic { + unsigned Intr; + uint8_t RsrcArg; + bool IsImage; +}; +const RsrcIntrinsic *lookupRsrcIntrinsic(unsigned Intr); + +struct D16ImageDimIntrinsic { + unsigned Intr; + unsigned D16HelperIntr; +}; +const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr); - /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. - /// Return -1 if the target-specific opcode for the pseudo instruction does - /// not exist. If Opcode is not a pseudo instruction, this is identity. - int pseudoToMCOpcode(int Opcode) const; +struct ImageDimIntrinsicInfo { + unsigned Intr; + unsigned BaseOpcode; + MIMGDim Dim; }; +const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr); + +} // end AMDGPU namespace } // End llvm namespace #endif diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index c024010f3e96..96b7568eec1f 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2, [SDTCisFP<1>, SDTCisSameAs<1, 2>] >; +def AMDGPUIntPackOp : SDTypeProfile<1, 2, + [SDTCisInt<1>, SDTCisSameAs<1, 2>] +>; + def AMDGPUDivScaleOp : SDTypeProfile<2, 3, [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] >; @@ -136,12 +140,18 @@ def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; +def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; + // out = 1.0 / sqrt(a) result clamped to +/- max_float. def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; +def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; +def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; +def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; +def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>; @@ -160,8 +170,6 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] >; -def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; - // out = min(a, b) a and b are floats, where a nan comparison fails. 
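// (Note the empty property list below: a failed compare makes the result
// depend on operand order, so the node is not marked SDNPCommutative.)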
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, [] @@ -333,6 +341,13 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", + SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, + SDTCisFP<0>, SDTCisVec<1>]>, + []>; + +def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; + def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 16d240e96196..219d430fbb39 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -17,6 +17,12 @@ #include "AMDGPURegisterBankInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -30,10 +36,48 @@ using namespace llvm; +#define GET_GLOBALISEL_IMPL +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL +#undef AMDGPUSubtarget + AMDGPUInstructionSelector::AMDGPUInstructionSelector( - const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI) + const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, + const AMDGPUTargetMachine &TM) : InstructionSelector(), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI), AMDGPUASI(STI.getAMDGPUAS()) {} + TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), + STI(STI), + EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), +#define GET_GLOBALISEL_PREDICATES_INIT +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_INIT +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT + ,AMDGPUASI(STI.getAMDGPUAS()) +{ +} + +const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } + +bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + I.setDesc(TII.get(TargetOpcode::COPY)); + for (const MachineOperand &MO : I.operands()) { + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + continue; + + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(MO, MRI); + if (!RC) + continue; + RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + } + return true; +} MachineOperand AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, @@ -71,6 +115,10 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, } } +static int64_t getConstant(const MachineInstr *MI) { + return MI->getOperand(1).getCImm()->getSExtValue(); +} + bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); @@ -118,12 +166,144 @@ bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const { return selectG_ADD(I); } +bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { + MachineBasicBlock *BB 
= I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const MachineOperand &MO = I.getOperand(0); + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(MO, MRI); + if (RC) + RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); + return true; +} + +bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I, + CodeGenCoverage &CoverageInfo) const { + unsigned IntrinsicID = I.getOperand(1).getIntrinsicID(); + + switch (IntrinsicID) { + default: + break; + case Intrinsic::maxnum: + case Intrinsic::minnum: + case Intrinsic::amdgcn_cvt_pkrtz: + return selectImpl(I, CoverageInfo); + + case Intrinsic::amdgcn_kernarg_segment_ptr: { + MachineFunction *MF = I.getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const ArgDescriptor *InputPtrReg; + const TargetRegisterClass *RC; + const DebugLoc &DL = I.getDebugLoc(); + + std::tie(InputPtrReg, RC) + = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + if (!InputPtrReg) + report_fatal_error("missing kernarg segment ptr"); + + BuildMI(*I.getParent(), &I, DL, TII.get(AMDGPU::COPY)) + .add(I.getOperand(0)) + .addReg(MRI.getLiveInVirtReg(InputPtrReg->getRegister())); + I.eraseFromParent(); + return true; + } + } + return false; +} + +static MachineInstr * +buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, + unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3, + unsigned VM, bool Compr, unsigned Enabled, bool Done) { + const DebugLoc &DL = Insert->getDebugLoc(); + MachineBasicBlock &BB = *Insert->getParent(); + unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP; + return BuildMI(BB, Insert, DL, TII.get(Opcode)) + .addImm(Tgt) + .addReg(Reg0) + .addReg(Reg1) + .addReg(Reg2) + .addReg(Reg3) + .addImm(VM) + .addImm(Compr) + .addImm(Enabled); +} + +bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( + MachineInstr &I, + CodeGenCoverage &CoverageInfo) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned IntrinsicID = I.getOperand(0).getIntrinsicID(); + switch (IntrinsicID) { + case Intrinsic::amdgcn_exp: { + int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg())); + int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg())); + int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg())); + int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg())); + + MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(), + I.getOperand(4).getReg(), + I.getOperand(5).getReg(), + I.getOperand(6).getReg(), + VM, false, Enabled, Done); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI); + } + case Intrinsic::amdgcn_exp_compr: { + const DebugLoc &DL = I.getDebugLoc(); + int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg())); + int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg())); + unsigned Reg0 = I.getOperand(3).getReg(); + unsigned Reg1 = I.getOperand(4).getReg(); + unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg())); + int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg())); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); + MachineInstr *Exp = buildEXP(TII, &I, 
Tgt, Reg0, Reg1, Undef, Undef, VM, + true, Enabled, Done); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI); + } + } + return false; +} + bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL = I.getDebugLoc(); + unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); + unsigned Opcode; // FIXME: Select store instruction based on address space - MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD)) + switch (StoreSize) { + default: + return false; + case 32: + Opcode = AMDGPU::FLAT_STORE_DWORD; + break; + case 64: + Opcode = AMDGPU::FLAT_STORE_DWORDX2; + break; + case 96: + Opcode = AMDGPU::FLAT_STORE_DWORDX3; + break; + case 128: + Opcode = AMDGPU::FLAT_STORE_DWORDX4; + break; + } + + MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode)) .add(I.getOperand(1)) .add(I.getOperand(0)) .addImm(0) // offset @@ -143,36 +323,67 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineOperand &ImmOp = I.getOperand(1); + + // The AMDGPU backend only supports Imm operands and not CImm or FPImm. + if (ImmOp.isFPImm()) { + const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); + ImmOp.ChangeToImmediate(Imm.getZExtValue()); + } else if (ImmOp.isCImm()) { + ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); + } + unsigned DstReg = I.getOperand(0).getReg(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned Size; + bool IsSgpr; + const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg()); + if (RB) { + IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; + Size = MRI.getType(DstReg).getSizeInBits(); + } else { + const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg); + IsSgpr = TRI.isSGPRClass(RC); + Size = TRI.getRegSizeInBits(*RC); + } + if (Size != 32 && Size != 64) + return false; + + unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; if (Size == 32) { - I.setDesc(TII.get(AMDGPU::S_MOV_B32)); + I.setDesc(TII.get(Opcode)); + I.addImplicitDefUseOperands(*MF); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - assert(Size == 64); - DebugLoc DL = I.getDebugLoc(); - unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - const APInt &Imm = I.getOperand(1).getCImm()->getValue(); + const TargetRegisterClass *RC = IsSgpr ? 
&AMDGPU::SReg_32_XM0RegClass : + &AMDGPU::VGPR_32RegClass; + unsigned LoReg = MRI.createVirtualRegister(RC); + unsigned HiReg = MRI.createVirtualRegister(RC); + const APInt &Imm = APInt(Size, I.getOperand(1).getImm()); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg) + BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) .addImm(Imm.trunc(32).getZExtValue()); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) + BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) .addImm(Imm.ashr(32).getZExtValue()); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(LoReg) - .addImm(AMDGPU::sub0) - .addReg(HiReg) - .addImm(AMDGPU::sub1); + const MachineInstr *RS = + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + // We can't call constrainSelectedInstRegOperands here, because it doesn't // work for target independent opcodes I.eraseFromParent(); - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI); + const TargetRegisterClass *DstRC = + TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI); + if (!DstRC) + return true; + return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); } static bool isConstant(const MachineInstr &MI) { @@ -228,6 +439,9 @@ static bool isInstrUniform(const MachineInstr &MI) { isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) return true; + if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) + return true; + const Instruction *I = dyn_cast<Instruction>(Ptr); return I && I->getMetadata("amdgpu.uniform"); } @@ -292,7 +506,8 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, if (!I.hasOneMemOperand()) return false; - if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS) + if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS && + (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT) return false; if (!isInstrUniform(I)) @@ -303,7 +518,7 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); - const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned DstReg = I.getOperand(0).getReg(); const DebugLoc &DL = I.getDebugLoc(); @@ -405,18 +620,30 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { bool AMDGPUInstructionSelector::select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { - if (!isPreISelGenericOpcode(I.getOpcode())) + if (!isPreISelGenericOpcode(I.getOpcode())) { + if (I.isCopy()) + return selectCOPY(I); return true; + } switch (I.getOpcode()) { default: - break; + return selectImpl(I, CoverageInfo); case TargetOpcode::G_ADD: return selectG_ADD(I); + case TargetOpcode::G_BITCAST: + return selectCOPY(I); case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_FCONSTANT: return selectG_CONSTANT(I); case TargetOpcode::G_GEP: return selectG_GEP(I); + case TargetOpcode::G_IMPLICIT_DEF: + return selectG_IMPLICIT_DEF(I); + case TargetOpcode::G_INTRINSIC: + return selectG_INTRINSIC(I, CoverageInfo); + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo); case TargetOpcode::G_LOAD: return selectG_LOAD(I); case TargetOpcode::G_STORE: @@ -424,3 +651,47 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, } return false; } + +InstructionSelector::ComplexRendererFns 
+AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); } + }}; + +} + +/// +/// This will select either an SGPR or VGPR operand and will save us from +/// having to write an extra tablegen pattern. +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); } + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src0_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod + }}; +} +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods + }}; +} diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 715c4882f380..68b40b20aca2 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -15,27 +15,39 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H #include "AMDGPU.h" +#include "AMDGPUArgumentUsageInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +namespace { +#define GET_GLOBALISEL_PREDICATE_BITSET +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET +#undef AMDGPUSubtarget +} + namespace llvm { class AMDGPUInstrInfo; class AMDGPURegisterBankInfo; +class GCNSubtarget; class MachineInstr; class MachineOperand; class MachineRegisterInfo; class SIInstrInfo; +class SIMachineFunctionInfo; class SIRegisterInfo; -class SISubtarget; class AMDGPUInstructionSelector : public InstructionSelector { public: - AMDGPUInstructionSelector(const SISubtarget &STI, - const AMDGPURegisterBankInfo &RBI); + AMDGPUInstructionSelector(const GCNSubtarget &STI, + const AMDGPURegisterBankInfo &RBI, + const AMDGPUTargetMachine &TM); bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + static const char *getName(); private: struct GEPInfo { @@ -46,10 +58,18 @@ private: GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { } }; + /// tblgen-erated 'select' implementation. 
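  /// Generated from the GlobalISel patterns in AMDGPUGenGlobalISel.inc;
  /// returns true if a pattern matched and selected \p I.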
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const; + bool selectCOPY(MachineInstr &I) const; bool selectG_CONSTANT(MachineInstr &I) const; bool selectG_ADD(MachineInstr &I) const; bool selectG_GEP(MachineInstr &I) const; + bool selectG_IMPLICIT_DEF(MachineInstr &I) const; + bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I, + CodeGenCoverage &CoverageInfo) const; bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const; void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const; @@ -57,9 +77,35 @@ private: bool selectG_LOAD(MachineInstr &I) const; bool selectG_STORE(MachineInstr &I) const; + InstructionSelector::ComplexRendererFns + selectVCSRC(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectVSRC0(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectVOP3Mods0(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3OMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3Mods(MachineOperand &Root) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; + const AMDGPUTargetMachine &TM; + const GCNSubtarget &STI; + bool EnableLateStructurizeCFG; +#define GET_GLOBALISEL_PREDICATES_DECL +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_DECL +#undef AMDGPUSubtarget + +#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL + protected: AMDGPUAS AMDGPUASI; }; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 31f728b0c22f..9426df399597 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -42,6 +42,47 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "", field bits<32> Inst = 0xffffffff; } +//===---------------------------------------------------------------------===// +// Return instruction +//===---------------------------------------------------------------------===// + +class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 1; +} + +def TruePredicate : Predicate<"true">; + +// Exists to help track down where SubtargetPredicate isn't set rather +// than letting tablegen crash with an unhelpful error. 
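// Anything that still carries this default is then easy to spot: the string
// below appears verbatim wherever the offending predicate is emitted or
// reported.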
+def InvalidPred : Predicate<"predicate not set on instruction or pattern">; + +class PredicateControl { + Predicate SubtargetPredicate = InvalidPred; + list<Predicate> AssemblerPredicates = []; + Predicate AssemblerPredicate = TruePredicate; + list<Predicate> OtherPredicates = []; + list<Predicate> Predicates = !listconcat([SubtargetPredicate, + AssemblerPredicate], + AssemblerPredicates, + OtherPredicates); +} +class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>, + PredicateControl; + def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">; def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">; def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">; @@ -52,7 +93,6 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; def FMA : Predicate<"Subtarget->hasFMA()">; def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; -def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; def u16ImmTarget : AsmOperandClass { let Name = "U16Imm"; @@ -95,12 +135,6 @@ def brtarget : Operand<OtherVT>; // Misc. PatFrags //===----------------------------------------------------------------------===// -class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag< - (ops node:$src0), - (op $src0), - [{ return N->hasOneUse(); }] ->; - class HasOneUseBinOp<SDPatternOperator op> : PatFrag< (ops node:$src0, node:$src1), (op $src0, $src1), @@ -113,8 +147,6 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag< [{ return N->hasOneUse(); }] >; -def trunc_oneuse : HasOneUseUnaryOp<trunc>; - let Properties = [SDNPCommutative, SDNPAssociative] in { def smax_oneuse : HasOneUseBinOp<smax>; def smin_oneuse : HasOneUseBinOp<smin>; @@ -127,6 +159,7 @@ def or_oneuse : HasOneUseBinOp<or>; def xor_oneuse : HasOneUseBinOp<xor>; } // Properties = [SDNPCommutative, SDNPAssociative] +def add_oneuse : HasOneUseBinOp<add>; def sub_oneuse : HasOneUseBinOp<sub>; def srl_oneuse : HasOneUseBinOp<srl>; @@ -240,6 +273,37 @@ def COND_NULL : PatLeaf < [{(void)N; return false;}] >; +//===----------------------------------------------------------------------===// +// PatLeafs for Texture Constants +//===----------------------------------------------------------------------===// + +def TEX_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 9 || TType == 10 || TType == 16; + }] +>; + +def TEX_RECT : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 5; + }] +>; + +def TEX_SHADOW : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return (TType >= 6 && TType <= 8) || TType == 13; + }] +>; + +def TEX_SHADOW_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 11 || TType == 12 || TType == 17; + }] +>; //===----------------------------------------------------------------------===// // Load/Store Pattern Fragments @@ -249,6 +313,10 @@ class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ return cast<MemSDNode>(N)->getAlignment() % 8 == 0; }]>; +class Aligned16Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAlignment() >= 16; +}]>; + class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>; class StoreFrag<SDPatternOperator op> : PatFrag < @@ -361,21 +429,31 @@ def az_extloadi8_local : LocalLoad <az_extloadi8>; def sextloadi8_local : LocalLoad <sextloadi8>; def az_extloadi16_local : LocalLoad <az_extloadi16>; def sextloadi16_local : LocalLoad <sextloadi16>; +def atomic_load_32_local : 
LocalLoad<atomic_load_32>; +def atomic_load_64_local : LocalLoad<atomic_load_64>; def store_local : LocalStore <store>; def truncstorei8_local : LocalStore <truncstorei8>; def truncstorei16_local : LocalStore <truncstorei16>; def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress; def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress; +def atomic_store_local : LocalStore <atomic_store>; def load_align8_local : Aligned8Bytes < (ops node:$ptr), (load_local node:$ptr) >; +def load_align16_local : Aligned16Bytes < + (ops node:$ptr), (load_local node:$ptr) +>; + def store_align8_local : Aligned8Bytes < (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) >; +def store_align16_local : Aligned16Bytes < + (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) +>; def load_flat : FlatLoad <load>; def az_extloadi8_flat : FlatLoad <az_extloadi8>; @@ -571,6 +649,18 @@ multiclass BFIPatterns <Instruction BFI_INT, (BFI_INT $x, $y, $z) >; + // 64-bit version + def : AMDGPUPat < + (or (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), + (REG_SEQUENCE RC64, + (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)), + (i32 (EXTRACT_SUBREG $y, sub0)), + (i32 (EXTRACT_SUBREG $z, sub0))), sub0, + (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)), + (i32 (EXTRACT_SUBREG $y, sub1)), + (i32 (EXTRACT_SUBREG $z, sub1))), sub1) + >; + // SHA-256 Ch function // z ^ (x & (y ^ z)) def : AMDGPUPat < @@ -578,6 +668,18 @@ multiclass BFIPatterns <Instruction BFI_INT, (BFI_INT $x, $y, $z) >; + // 64-bit version + def : AMDGPUPat < + (xor i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), + (REG_SEQUENCE RC64, + (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)), + (i32 (EXTRACT_SUBREG $y, sub0)), + (i32 (EXTRACT_SUBREG $z, sub0))), sub0, + (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)), + (i32 (EXTRACT_SUBREG $y, sub1)), + (i32 (EXTRACT_SUBREG $z, sub1))), sub1) + >; + def : AMDGPUPat < (fcopysign f32:$src0, f32:$src1), (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1) @@ -611,10 +713,25 @@ multiclass BFIPatterns <Instruction BFI_INT, // SHA-256 Ma patterns // ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y -class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : AMDGPUPat < - (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), - (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) ->; +multiclass SHA256MaPattern <Instruction BFI_INT, Instruction XOR, RegisterClass RC64> { + def : AMDGPUPat < + (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), + (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) + >; + + def : AMDGPUPat < + (or (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))), + (REG_SEQUENCE RC64, + (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub0)), + (i32 (EXTRACT_SUBREG $y, sub0))), + (i32 (EXTRACT_SUBREG $z, sub0)), + (i32 (EXTRACT_SUBREG $y, sub0))), sub0, + (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub1)), + (i32 (EXTRACT_SUBREG $y, sub1))), + (i32 (EXTRACT_SUBREG $z, sub1)), + (i32 (EXTRACT_SUBREG $y, sub1))), sub1) + >; +} // Bitfield extract patterns @@ -633,14 +750,33 @@ multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> { (UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) >; + // x & ((1 << y) - 1) + def : AMDGPUPat < + (and i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)), + (UBFE $src, (MOV (i32 0)), $width) + >; + + // x & ~(-1 << y) + def : AMDGPUPat < + (and i32:$src, (xor_oneuse (shl_oneuse -1, i32:$width), -1)), + (UBFE $src, (MOV (i32 0)), $width) + >; + + // x & (-1 >> (bitwidth - y)) + def : AMDGPUPat < + (and i32:$src, (srl_oneuse -1, (sub 32, 
i32:$width))), + (UBFE $src, (MOV (i32 0)), $width) + >; + + // x << (bitwidth - y) >> (bitwidth - y) def : AMDGPUPat < (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), - (UBFE $src, (i32 0), $width) + (UBFE $src, (MOV (i32 0)), $width) >; def : AMDGPUPat < (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), - (SBFE $src, (i32 0), $width) + (SBFE $src, (MOV (i32 0)), $width) >; } @@ -697,11 +833,3 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat < (AMDGPUrcp (fsqrt vt:$src)), (RsqInst $src) >; - -include "R600Instructions.td" -include "R700Instructions.td" -include "EvergreenInstructions.td" -include "CaymanInstructions.td" - -include "SIInstrInfo.td" - diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 86dc9bd9ea74..896e2055cf62 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -8,7 +8,7 @@ //==-----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU Implementation of the IntrinsicInfo class. +/// AMDGPU Implementation of the IntrinsicInfo class. // //===-----------------------------------------------------------------------===// @@ -25,13 +25,13 @@ AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() static const char *const IntrinsicNameTable[] = { #define GET_INTRINSIC_NAME_TABLE -#include "AMDGPUGenIntrinsics.inc" +#include "AMDGPUGenIntrinsicImpl.inc" #undef GET_INTRINSIC_NAME_TABLE }; namespace { #define GET_INTRINSIC_ATTRIBUTES -#include "AMDGPUGenIntrinsics.inc" +#include "AMDGPUGenIntrinsicImpl.inc" #undef GET_INTRINSIC_ATTRIBUTES } @@ -80,7 +80,7 @@ unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { // Overload Table #define GET_INTRINSIC_OVERLOAD_TABLE -#include "AMDGPUGenIntrinsics.inc" +#include "AMDGPUGenIntrinsicImpl.inc" #undef GET_INTRINSIC_OVERLOAD_TABLE } diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index 6cb8b9644642..ef42f9a319af 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -8,7 +8,7 @@ //==-----------------------------------------------------------------------===// // /// \file -/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. +/// Interface for the AMDGPU Implementation of the Intrinsic Info class. 
// //===-----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H @@ -24,7 +24,7 @@ namespace AMDGPUIntrinsic { enum ID { last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, #define GET_INTRINSIC_ENUM_VALUES -#include "AMDGPUGenIntrinsics.inc" +#include "AMDGPUGenIntrinsicEnums.inc" #undef GET_INTRINSIC_ENUM_VALUES , num_AMDGPU_intrinsics }; diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td index 18c9bd933af2..230a04628504 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -13,7 +13,4 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; - def int_AMDGPU_kilp : Intrinsic<[], [], []>; } - -include "SIIntrinsics.td" diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b4704f6feb92..87b072c9ea20 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -12,7 +12,9 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// +#include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" +#include "AMDGPUTargetMachine.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" @@ -20,19 +22,46 @@ #include "llvm/Support/Debug.h" using namespace llvm; +using namespace LegalizeActions; -AMDGPULegalizerInfo::AMDGPULegalizerInfo() { +AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, + const GCNTargetMachine &TM) { using namespace TargetOpcode; - const LLT S1= LLT::scalar(1); + auto GetAddrSpacePtr = [&TM](unsigned AS) { + return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); + }; + + auto AMDGPUAS = ST.getAMDGPUAS(); + + const LLT S1 = LLT::scalar(1); const LLT V2S16 = LLT::vector(2, 16); + const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); - const LLT P1 = LLT::pointer(1, 64); - const LLT P2 = LLT::pointer(2, 64); + const LLT S512 = LLT::scalar(512); + + const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); + const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); + const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); + const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS.FLAT_ADDRESS); + const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS.PRIVATE_ADDRESS); + + const LLT AddrSpaces[] = { + GlobalPtr, + ConstantPtr, + LocalPtr, + FlatPtr, + PrivatePtr + }; setAction({G_ADD, S32}, Legal); + setAction({G_ASHR, S32}, Legal); + setAction({G_SUB, S32}, Legal); + setAction({G_MUL, S32}, Legal); setAction({G_AND, S32}, Legal); + setAction({G_OR, S32}, Legal); + setAction({G_XOR, S32}, Legal); setAction({G_BITCAST, V2S16}, Legal); setAction({G_BITCAST, 1, S32}, Legal); @@ -40,41 +69,88 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_BITCAST, S32}, Legal); setAction({G_BITCAST, 1, V2S16}, Legal); + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64}); + + // G_IMPLICIT_DEF is a no-op so we can make it legal for any value type that + // can fit in a register. + // FIXME: We need to legalize several more operations before we can add + // a test case for size > 512. 
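  // For example, an s1, s32, or s512 G_IMPLICIT_DEF is accepted as-is by the
  // rule below, while a wider scalar falls through to clampScalar and is
  // narrowed to s512.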
+ getActionDefinitionsBuilder(G_IMPLICIT_DEF) + .legalIf([=](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() <= 512; + }) + .clampScalar(0, S1, S512); + + getActionDefinitionsBuilder(G_CONSTANT) + .legalFor({S1, S32, S64}); + // FIXME: i1 operands to intrinsics should always be legal, but other i1 // values may not be legal. We need to figure out how to distinguish // between these two scenarios. setAction({G_CONSTANT, S1}, Legal); - setAction({G_CONSTANT, S32}, Legal); - setAction({G_CONSTANT, S64}, Legal); - - setAction({G_FCONSTANT, S32}, Legal); setAction({G_FADD, S32}, Legal); + setAction({G_FCMP, S1}, Legal); + setAction({G_FCMP, 1, S32}, Legal); + setAction({G_FCMP, 1, S64}, Legal); + setAction({G_FMUL, S32}, Legal); - setAction({G_GEP, P1}, Legal); - setAction({G_GEP, P2}, Legal); - setAction({G_GEP, 1, S64}, Legal); + setAction({G_ZEXT, S64}, Legal); + setAction({G_ZEXT, 1, S32}, Legal); + + setAction({G_FPTOSI, S32}, Legal); + setAction({G_FPTOSI, 1, S32}, Legal); + + setAction({G_SITOFP, S32}, Legal); + setAction({G_SITOFP, 1, S32}, Legal); + + setAction({G_FPTOUI, S32}, Legal); + setAction({G_FPTOUI, 1, S32}, Legal); + + for (LLT PtrTy : AddrSpaces) { + LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits()); + setAction({G_GEP, PtrTy}, Legal); + setAction({G_GEP, 1, IdxTy}, Legal); + } setAction({G_ICMP, S1}, Legal); setAction({G_ICMP, 1, S32}, Legal); - setAction({G_LOAD, P1}, Legal); - setAction({G_LOAD, P2}, Legal); - setAction({G_LOAD, S32}, Legal); - setAction({G_LOAD, 1, P1}, Legal); - setAction({G_LOAD, 1, P2}, Legal); - setAction({G_OR, S32}, Legal); + getActionDefinitionsBuilder({G_LOAD, G_STORE}) + .legalIf([=, &ST](const LegalityQuery &Query) { + const LLT &Ty0 = Query.Types[0]; + + // TODO: Decompose private loads into 4-byte components. + // TODO: Illegal flat loads on SI + switch (Ty0.getSizeInBits()) { + case 32: + case 64: + case 128: + return true; + + case 96: + // XXX hasLoadX3 + return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS); + + case 256: + case 512: + // TODO: constant loads + default: + return false; + } + }); + + setAction({G_SELECT, S32}, Legal); setAction({G_SELECT, 1, S1}, Legal); setAction({G_SHL, S32}, Legal); - setAction({G_STORE, S32}, Legal); - setAction({G_STORE, 1, P1}, Legal); // FIXME: When RegBankSelect inserts copies, it will only create new // registers with scalar types. This means we can end up with @@ -83,8 +159,54 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { // if it sees a generic instruction which isn't legal, so we need to // tell it that scalar types are legal for pointer operands setAction({G_GEP, S64}, Legal); - setAction({G_LOAD, 1, S64}, Legal); - setAction({G_STORE, 1, S64}, Legal); + + for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { + getActionDefinitionsBuilder(Op) + .legalIf([=](const LegalityQuery &Query) { + const LLT &VecTy = Query.Types[1]; + const LLT &IdxTy = Query.Types[2]; + return VecTy.getSizeInBits() % 32 == 0 && + VecTy.getSizeInBits() <= 512 && + IdxTy.getSizeInBits() == 32; + }); + } + + // FIXME: Doesn't handle extract of illegal sizes. + getActionDefinitionsBuilder({G_EXTRACT, G_INSERT}) + .legalIf([=](const LegalityQuery &Query) { + const LLT &Ty0 = Query.Types[0]; + const LLT &Ty1 = Query.Types[1]; + return (Ty0.getSizeInBits() % 32 == 0) && + (Ty1.getSizeInBits() % 32 == 0); + }); + + // Merge/Unmerge + for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { + unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; + unsigned LitTyIdx = Op == G_MERGE_VALUES ? 
1 : 0; + + getActionDefinitionsBuilder(Op) + .legalIf([=](const LegalityQuery &Query) { + const LLT &BigTy = Query.Types[BigTyIdx]; + const LLT &LitTy = Query.Types[LitTyIdx]; + return BigTy.getSizeInBits() % 32 == 0 && + LitTy.getSizeInBits() % 32 == 0 && + BigTy.getSizeInBits() <= 512; + }) + // Any vectors left are the wrong size. Scalarize them. + .fewerElementsIf([](const LegalityQuery &Query) { return true; }, + [](const LegalityQuery &Query) { + return std::make_pair( + 0, Query.Types[0].getElementType()); + }) + .fewerElementsIf([](const LegalityQuery &Query) { return true; }, + [](const LegalityQuery &Query) { + return std::make_pair( + 1, Query.Types[1].getElementType()); + }); + + } computeTables(); + verify(*ST.getInstrInfo()); } diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 291e3361f163..1cbd37c42c4b 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -19,12 +19,15 @@ namespace llvm { +class GCNTargetMachine; class LLVMContext; +class GCNSubtarget; /// This class provides the information for the target register banks. class AMDGPULegalizerInfo : public LegalizerInfo { public: - AMDGPULegalizerInfo(); + AMDGPULegalizerInfo(const GCNSubtarget &ST, + const GCNTargetMachine &TM); }; } // End llvm namespace. #endif diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp index f594767c8edb..7a7ed7a4f065 100644 --- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This file does AMD library function optimizations. +/// This file does AMD library function optimizations. 
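/// Calls into the device math library (pow, rootn, fma/mad, sqrt, sincos, ...)
/// are pattern-matched and replaced with cheaper straight-line IR, or with
/// their native_* counterparts where that is profitable; see the fold_*
/// helpers below.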
// //===----------------------------------------------------------------------===// @@ -765,8 +765,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { ArrayRef<double> tmp(DVal); nval = ConstantDataVector::get(context, tmp); } - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *nval << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n"); replaceCall(nval); return true; } @@ -776,8 +775,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { for (int i = 0; i < sz; ++i) { if (CF->isExactlyValue(ftbl[i].input)) { Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result); - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *nval << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n"); replaceCall(nval); return true; } @@ -798,11 +796,11 @@ bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) { AMDGPULibFunc nf = FInfo; nf.setPrefix(AMDGPULibFunc::NATIVE); if (Constant *FPExpr = getFunction(M, nf)) { - DEBUG(dbgs() << "AMDIC: " << *CI << " ---> "); + LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> "); CI->setCalledFunction(FPExpr); - DEBUG(dbgs() << *CI << '\n'); + LLVM_DEBUG(dbgs() << *CI << '\n'); return true; } @@ -820,8 +818,7 @@ bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B, Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0), opr0, "recip2div"); - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *nval << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n"); replaceCall(nval); return true; } @@ -899,7 +896,7 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) { // pow/powr/pown(x, 0) == 1 - DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n"); Constant *cnval = ConstantFP::get(eltType, 1.0); if (getVecSize(FInfo) > 1) { cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval); @@ -909,23 +906,21 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, } if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) { // pow/powr/pown(x, 1.0) = x - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n"); replaceCall(opr0); return true; } if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) { // pow/powr/pown(x, 2.0) = x*x - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *opr0 << " * " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " << *opr0 + << "\n"); Value *nval = B.CreateFMul(opr0, opr0, "__pow2"); replaceCall(nval); return true; } if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) { // pow/powr/pown(x, -1.0) = 1.0/x - DEBUG(errs() << "AMDIC: " << *CI - << " ---> 1 / " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1 / " << *opr0 << "\n"); Constant *cnval = ConstantFP::get(eltType, 1.0); if (getVecSize(FInfo) > 1) { cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval); @@ -942,8 +937,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(issqrt ? 
AMDGPULibFunc::EI_SQRT : AMDGPULibFunc::EI_RSQRT, FInfo))) { - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << FInfo.getName().c_str() << "(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << FInfo.getName().c_str() << "(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt" : "__pow2rsqrt"); replaceCall(nval); @@ -999,8 +994,9 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, } nval = B.CreateFDiv(cnval, nval, "__1powprod"); } - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0 + << ")\n"); replaceCall(nval); return true; } @@ -1137,8 +1133,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, nval = B.CreateBitCast(nval, opr0->getType()); } - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n"); replaceCall(nval); return true; @@ -1155,8 +1151,7 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, } int ci_opr1 = (int)CINT->getSExtValue(); if (ci_opr1 == 1) { // rootn(x, 1) = x - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n"); replaceCall(opr0); return true; } @@ -1166,7 +1161,7 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, Module *M = CI->getModule(); if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { - DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt"); replaceCall(nval); return true; @@ -1175,13 +1170,13 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, Module *M = CI->getModule(); if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) { - DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt"); replaceCall(nval); return true; } } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x - DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n"); Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), opr0, "__rootn2div"); @@ -1193,7 +1188,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, Module *M = CI->getModule(); if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) { - DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 + << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt"); replaceCall(nval); return true; @@ -1212,22 +1208,22 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B, ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1); if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) { // fma/mad(a, b, c) = c if a=0 || b=0 - DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n"); replaceCall(opr2); return true; } if (CF0 && CF0->isExactlyValue(1.0f)) { // 
fma/mad(a, b, c) = b+c if a=1 - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << *opr1 << " + " << *opr2 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2 + << "\n"); Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd"); replaceCall(nval); return true; } if (CF1 && CF1->isExactlyValue(1.0f)) { // fma/mad(a, b, c) = a+c if b=1 - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << *opr0 << " + " << *opr2 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2 + << "\n"); Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd"); replaceCall(nval); return true; @@ -1235,8 +1231,8 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B, if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) { if (CF->isZero()) { // fma/mad(a, b, c) = a*b if c=0 - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << *opr0 << " * " << *opr1 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " + << *opr1 << "\n"); Value *nval = B.CreateFMul(opr0, opr1, "fmamul"); replaceCall(nval); return true; @@ -1263,8 +1259,8 @@ bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B, if (Constant *FPExpr = getNativeFunction( CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { Value *opr0 = CI->getArgOperand(0); - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << "sqrt(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << "sqrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt"); replaceCall(nval); return true; @@ -1355,8 +1351,8 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B, P = B.CreateAddrSpaceCast(Alloc, PTy); CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P); - DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI - << ") with " << *Call << "\n"); + LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with " + << *Call << "\n"); if (!isSin) { // CI->cos, UI->sin B.SetInsertPoint(&*ItOld); @@ -1719,9 +1715,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) { bool Changed = false; auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - DEBUG(dbgs() << "AMDIC: process function "; - F.printAsOperand(dbgs(), false, F.getParent()); - dbgs() << '\n';); + LLVM_DEBUG(dbgs() << "AMDIC: process function "; + F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';); if (!EnablePreLink) Changed |= setFastFlags(F, Options); @@ -1737,8 +1732,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) { Function *Callee = CI->getCalledFunction(); if (Callee == 0) continue; - DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; - dbgs().flush()); + LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; + dbgs().flush()); if(Simplifier.fold(CI, AA)) Changed = true; } diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h index 5405bc645714..fe062384800a 100644 --- a/lib/Target/AMDGPU/AMDGPULibFunc.h +++ b/lib/Target/AMDGPU/AMDGPULibFunc.h @@ -1,4 +1,4 @@ -//===-- AMDGPULibFunc.h ---------------------------------------------------===// +//===-- AMDGPULibFunc.h ----------------------------------------*- C++ -*--===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index 7e0e9802c0e6..2cec8fe53283 100644 --- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -117,7 +117,6 @@ bool 
AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const { return false; const TargetMachine &TM = TPC->getTM<TargetMachine>(); - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F); bool Changed = false; for (auto *U : F.users()) { @@ -125,7 +124,7 @@ bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const { if (!CI) continue; - Changed |= ST.makeLIDRangeMetadata(CI); + Changed |= AMDGPUSubtarget::get(TM, F).makeLIDRangeMetadata(CI); } return Changed; } diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp new file mode 100644 index 000000000000..8cc7e38f7b29 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -0,0 +1,264 @@ +//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This pass replaces accesses to kernel arguments with loads from +/// offsets from the kernarg base pointer. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" + +#define DEBUG_TYPE "amdgpu-lower-kernel-arguments" + +using namespace llvm; + +namespace { + +class AMDGPULowerKernelArguments : public FunctionPass{ +public: + static char ID; + + AMDGPULowerKernelArguments() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesAll(); + } +}; + +} // end anonymous namespace + +bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { + CallingConv::ID CC = F.getCallingConv(); + if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty()) + return false; + + auto &TPC = getAnalysis<TargetPassConfig>(); + + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + LLVMContext &Ctx = F.getParent()->getContext(); + const DataLayout &DL = F.getParent()->getDataLayout(); + BasicBlock &EntryBlock = *F.begin(); + IRBuilder<> Builder(&*EntryBlock.begin()); + + const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary + const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); + + unsigned MaxAlign; + // FIXME: Alignment is broken with explicit arg offset. + const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign); + if (TotalKernArgSize == 0) + return false; + + CallInst *KernArgSegment = + Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr, 
F.getName() + ".kernarg.segment"); + + KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + KernArgSegment->addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); + + unsigned AS = KernArgSegment->getType()->getPointerAddressSpace(); + uint64_t ExplicitArgOffset = 0; + + for (Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + unsigned Align = DL.getABITypeAlignment(ArgTy); + unsigned Size = DL.getTypeSizeInBits(ArgTy); + unsigned AllocSize = DL.getTypeAllocSize(ArgTy); + + + // Clover seems to always pad i8/i16 to i32, but doesn't properly align + // them? + // Make sure the struct elements have correct size and alignment for ext + // args. These seem to be padded up to 4-bytes but not correctly aligned. + bool IsExtArg = AllocSize < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) && + !ST.isAmdHsaOS(); + if (IsExtArg) + AllocSize = 4; + + uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + + if (Arg.use_empty()) + continue; + + if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) { + // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing + // modes on SI to know the high bits are 0 so pointer adds don't wrap. We + // can't represent this with range metadata because it's only allowed for + // integer types. + if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) + continue; + + // FIXME: We can replace this with equivalent alias.scope/noalias + // metadata, but this appears to be a lot of work. + if (Arg.hasNoAliasAttr()) + continue; + } + + VectorType *VT = dyn_cast<VectorType>(ArgTy); + bool IsV3 = VT && VT->getNumElements() == 3; + VectorType *V4Ty = nullptr; + + int64_t AlignDownOffset = alignDown(EltOffset, 4); + int64_t OffsetDiff = EltOffset - AlignDownOffset; + unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset); + + Value *ArgPtr; + if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types + // Since we don't have sub-dword scalar loads, avoid doing an extload by + // loading earlier than the argument address, and extracting the relevant + // bits. + // + // Additionally widen any sub-dword load to i32 even if suitably aligned, + // so that CSE between different argument loads works easily. 
+ + ArgPtr = Builder.CreateConstInBoundsGEP1_64( + KernArgSegment, + AlignDownOffset, + Arg.getName() + ".kernarg.offset.align.down"); + ArgPtr = Builder.CreateBitCast(ArgPtr, + Builder.getInt32Ty()->getPointerTo(AS), + ArgPtr->getName() + ".cast"); + } else { + ArgPtr = Builder.CreateConstInBoundsGEP1_64( + KernArgSegment, + AlignDownOffset, + Arg.getName() + ".kernarg.offset"); + ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS), + ArgPtr->getName() + ".cast"); + } + + assert((!IsExtArg || !IsV3) && "incompatible situation"); + + if (IsV3 && Size >= 32) { + V4Ty = VectorType::get(VT->getVectorElementType(), 4); + // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads + ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS)); + } + + LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign); + Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); + + MDBuilder MDB(Ctx); + + if (isa<PointerType>(ArgTy)) { + if (Arg.hasNonNullAttr()) + Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {})); + + uint64_t DerefBytes = Arg.getDereferenceableBytes(); + if (DerefBytes != 0) { + Load->setMetadata( + LLVMContext::MD_dereferenceable, + MDNode::get(Ctx, + MDB.createConstant( + ConstantInt::get(Builder.getInt64Ty(), DerefBytes)))); + } + + uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes(); + if (DerefOrNullBytes != 0) { + Load->setMetadata( + LLVMContext::MD_dereferenceable_or_null, + MDNode::get(Ctx, + MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), + DerefOrNullBytes)))); + } + + unsigned ParamAlign = Arg.getParamAlignment(); + if (ParamAlign != 0) { + Load->setMetadata( + LLVMContext::MD_align, + MDNode::get(Ctx, + MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), + ParamAlign)))); + } + } + + // TODO: Convert noalias arg to !noalias + + if (Size < 32 && !ArgTy->isAggregateType()) { + if (IsExtArg && OffsetDiff == 0) { + Type *I32Ty = Builder.getInt32Ty(); + bool IsSext = Arg.hasSExtAttr(); + Metadata *LowAndHigh[] = { + ConstantAsMetadata::get( + ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)), + ConstantAsMetadata::get( + ConstantInt::get(I32Ty, + IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1)) + }; + + Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh)); + } + + Value *ExtractBits = OffsetDiff == 0 ? 
+ Load : Builder.CreateLShr(Load, OffsetDiff * 8); + + IntegerType *ArgIntTy = Builder.getIntNTy(Size); + Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy); + Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy, + Arg.getName() + ".load"); + Arg.replaceAllUsesWith(NewVal); + } else if (IsV3) { + Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty), + {0, 1, 2}, + Arg.getName() + ".load"); + Arg.replaceAllUsesWith(Shuf); + } else { + Load->setName(Arg.getName() + ".load"); + Arg.replaceAllUsesWith(Load); + } + } + + KernArgSegment->addAttribute( + AttributeList::ReturnIndex, + Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); + + return true; +} + +INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE, + "AMDGPU Lower Kernel Arguments", false, false) +INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments", + false, false) + +char AMDGPULowerKernelArguments::ID = 0; + +FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() { + return new AMDGPULowerKernelArguments(); +} diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp new file mode 100644 index 000000000000..a43dcef4cf0b --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -0,0 +1,270 @@ +//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This pass attempts to make use of reqd_work_group_size metadata +/// to eliminate loads from the dispatch packet and to constant fold OpenCL +/// get_local_size-like functions. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" + +#define DEBUG_TYPE "amdgpu-lower-kernel-attributes" + +using namespace llvm; + +namespace { + +// Field offsets in hsa_kernel_dispatch_packet_t. 
+enum DispatchPackedOffsets { + WORKGROUP_SIZE_X = 4, + WORKGROUP_SIZE_Y = 6, + WORKGROUP_SIZE_Z = 8, + + GRID_SIZE_X = 12, + GRID_SIZE_Y = 16, + GRID_SIZE_Z = 20 +}; + +class AMDGPULowerKernelAttributes : public ModulePass { + Module *Mod = nullptr; + +public: + static char ID; + + AMDGPULowerKernelAttributes() : ModulePass(ID) {} + + bool processUse(CallInst *CI); + + bool doInitialization(Module &M) override; + bool runOnModule(Module &M) override; + + StringRef getPassName() const override { + return "AMDGPU Kernel Attributes"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } +}; + +} // end anonymous namespace + +bool AMDGPULowerKernelAttributes::doInitialization(Module &M) { + Mod = &M; + return false; +} + +bool AMDGPULowerKernelAttributes::processUse(CallInst *CI) { + Function *F = CI->getParent()->getParent(); + + auto MD = F->getMetadata("reqd_work_group_size"); + const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3; + + const bool HasUniformWorkGroupSize = + F->getFnAttribute("uniform-work-group-size").getValueAsString() == "true"; + + if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize) + return false; + + Value *WorkGroupSizeX = nullptr; + Value *WorkGroupSizeY = nullptr; + Value *WorkGroupSizeZ = nullptr; + + Value *GridSizeX = nullptr; + Value *GridSizeY = nullptr; + Value *GridSizeZ = nullptr; + + const DataLayout &DL = Mod->getDataLayout(); + + // We expect to see several GEP users, casted to the appropriate type and + // loaded. + for (User *U : CI->users()) { + if (!U->hasOneUse()) + continue; + + int64_t Offset = 0; + if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) + continue; + + auto *BCI = dyn_cast<BitCastInst>(*U->user_begin()); + if (!BCI || !BCI->hasOneUse()) + continue; + + auto *Load = dyn_cast<LoadInst>(*BCI->user_begin()); + if (!Load || !Load->isSimple()) + continue; + + unsigned LoadSize = DL.getTypeStoreSize(Load->getType()); + + // TODO: Handle merged loads. + switch (Offset) { + case WORKGROUP_SIZE_X: + if (LoadSize == 2) + WorkGroupSizeX = Load; + break; + case WORKGROUP_SIZE_Y: + if (LoadSize == 2) + WorkGroupSizeY = Load; + break; + case WORKGROUP_SIZE_Z: + if (LoadSize == 2) + WorkGroupSizeZ = Load; + break; + case GRID_SIZE_X: + if (LoadSize == 4) + GridSizeX = Load; + break; + case GRID_SIZE_Y: + if (LoadSize == 4) + GridSizeY = Load; + break; + case GRID_SIZE_Z: + if (LoadSize == 4) + GridSizeZ = Load; + break; + default: + break; + } + } + + // Pattern match the code used to handle partial workgroup dispatches in the + // library implementation of get_local_size, so the entire function can be + // constant folded with a known group size. + // + // uint r = grid_size - group_id * group_size; + // get_local_size = (r < group_size) ? r : group_size; + // + // If we have uniform-work-group-size (which is the default in OpenCL 1.2), + // the grid_size is required to be a multiple of group_size). In this case: + // + // grid_size - (group_id * group_size) < group_size + // -> + // grid_size < group_size + (group_id * group_size) + // + // (grid_size / group_size) < 1 + group_id + // + // grid_size / group_size is at least 1, so we can conclude the select + // condition is false (except for group_id == 0, where the select result is + // the same). 
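To see why the select always folds under a uniform work-group size, here is a small standalone C++ check of the identity derived in the comment above. The variable names mirror the comment; this is a sketch of the arithmetic, not code from the pass:

#include <cassert>
#include <cstdint>

int main() {
  // Uniform work-group size: grid_size is a multiple of group_size.
  const uint32_t group_size = 64;
  const uint32_t num_groups = 4;
  const uint32_t grid_size = num_groups * group_size; // 256

  for (uint32_t group_id = 0; group_id < num_groups; ++group_id) {
    uint32_t r = grid_size - group_id * group_size; // 256, 192, 128, 64
    uint32_t local_size = (r < group_size) ? r : group_size;
    assert(local_size == group_size); // the select never picks the partial size
  }
  return 0;
}

With grid_size = 256 and group_size = 64, r takes the values 256, 192, 128 and 64, so r < group_size never holds and get_local_size is always the full group_size, exactly as the pass concludes.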
+ + bool MadeChange = false; + Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ }; + Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ }; + + for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) { + Value *GroupSize = WorkGroupSizes[I]; + Value *GridSize = GridSizes[I]; + if (!GroupSize || !GridSize) + continue; + + for (User *U : GroupSize->users()) { + auto *ZextGroupSize = dyn_cast<ZExtInst>(U); + if (!ZextGroupSize) + continue; + + for (User *ZextUser : ZextGroupSize->users()) { + auto *SI = dyn_cast<SelectInst>(ZextUser); + if (!SI) + continue; + + using namespace llvm::PatternMatch; + auto GroupIDIntrin = I == 0 ? + m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() : + (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() : + m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); + + auto SubExpr = m_Sub(m_Specific(GridSize), + m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))); + + ICmpInst::Predicate Pred; + if (match(SI, + m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)), + SubExpr, + m_Specific(ZextGroupSize))) && + Pred == ICmpInst::ICMP_ULT) { + if (HasReqdWorkGroupSize) { + ConstantInt *KnownSize + = mdconst::extract<ConstantInt>(MD->getOperand(I)); + SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize, + SI->getType(), + false)); + } else { + SI->replaceAllUsesWith(ZextGroupSize); + } + + MadeChange = true; + } + } + } + } + + if (!HasReqdWorkGroupSize) + return MadeChange; + + // Eliminate any other loads we can from the dispatch packet. + for (int I = 0; I < 3; ++I) { + Value *GroupSize = WorkGroupSizes[I]; + if (!GroupSize) + continue; + + ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I)); + GroupSize->replaceAllUsesWith( + ConstantExpr::getIntegerCast(KnownSize, + GroupSize->getType(), + false)); + MadeChange = true; + } + + return MadeChange; +} + +// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get +// TargetPassConfig for subtarget. +bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { + StringRef DispatchPtrName + = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr); + + Function *DispatchPtr = Mod->getFunction(DispatchPtrName); + if (!DispatchPtr) // Dispatch ptr not used. + return false; + + bool MadeChange = false; + + SmallPtrSet<Instruction *, 4> HandledUses; + for (auto *U : DispatchPtr->users()) { + CallInst *CI = cast<CallInst>(U); + if (HandledUses.insert(CI).second) { + if (processUse(CI)) + MadeChange = true; + } + } + + return MadeChange; +} + +INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE, + "AMDGPU IR optimizations", false, false) +INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations", + false, false) + +char AMDGPULowerKernelAttributes::ID = 0; + +ModulePass *llvm::createAMDGPULowerKernelAttributesPass() { + return new AMDGPULowerKernelAttributes(); +} diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 23fd8113932c..1876dc3f7122 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -8,16 +8,17 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. +/// Code to lower AMDGPU MachineInstrs to their corresponding MCInst. 
// //===----------------------------------------------------------------------===// // -#include "AMDGPUMCInstLower.h" #include "AMDGPUAsmPrinter.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "InstPrinter/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "R600AsmPrinter.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" @@ -36,9 +37,43 @@ using namespace llvm; +namespace { + +class AMDGPUMCInstLower { + MCContext &Ctx; + const TargetSubtargetInfo &ST; + const AsmPrinter &AP; + + const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB, + const MachineOperand &MO) const; + +public: + AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST, + const AsmPrinter &AP); + + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; + + /// Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; + +}; + +class R600MCInstLower : public AMDGPUMCInstLower { +public: + R600MCInstLower(MCContext &ctx, const R600Subtarget &ST, + const AsmPrinter &AP); + + /// Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; +}; + + +} // End anonymous namespace + #include "AMDGPUGenMCPseudoLowering.inc" -AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st, +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, + const TargetSubtargetInfo &st, const AsmPrinter &ap): Ctx(ctx), ST(st), AP(ap) { } @@ -129,7 +164,7 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { unsigned Opcode = MI->getOpcode(); - const auto *TII = ST.getInstrInfo(); + const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We // need to select it to the subtarget specific version, and there's no way to @@ -169,16 +204,18 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { - const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>(); AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this); return MCInstLowering.lowerOperand(MO, MCOp); } -const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { +static const MCExpr *lowerAddrSpaceCast(const TargetMachine &TM, + const Constant *CV, + MCContext &OutContext) { // TargetMachine does not support llvm-style cast. Use C++-style cast. // This is safe since TM is always of type AMDGPUTargetMachine or its // derived class. - auto *AT = static_cast<AMDGPUTargetMachine*>(&TM); + auto &AT = static_cast<const AMDGPUTargetMachine&>(TM); auto *CE = dyn_cast<ConstantExpr>(CV); // Lower null pointers in private and local address space. 
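As background for the null-pointer check that continues in the next hunk, here is a hedged C++ sketch of the null-value convention this lowering relies on. The helper name nullPointerValueFor, the concrete address-space numbers, and the all-ones value for local/private null are assumptions about the AMDGPU convention used for illustration, not something stated in this diff:

#include <cstdint>

// Hypothetical stand-in for a getNullPointerValue()-style query: flat and
// global null lower to 0, while local (LDS) and private (scratch) null use
// the all-ones pattern, so "null as a local pointer" is emitted as -1.
static int64_t nullPointerValueFor(unsigned AddrSpace) {
  const unsigned LocalAS = 3;   // assumed address-space number for LDS
  const unsigned PrivateAS = 5; // assumed address-space number for scratch
  return (AddrSpace == LocalAS || AddrSpace == PrivateAS) ? -1 : 0;
}

Given a mapping like this, a constant addrspacecast of a flat null pointer (numeric value 0) into local or private memory can be emitted directly as the destination space's numeric null, which is what the MCConstantExpr::create call in the following hunk produces.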
@@ -187,12 +224,18 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) { auto Op = CE->getOperand(0); auto SrcAddr = Op->getType()->getPointerAddressSpace(); - if (Op->isNullValue() && AT->getNullPointerValue(SrcAddr) == 0) { + if (Op->isNullValue() && AT.getNullPointerValue(SrcAddr) == 0) { auto DstAddr = CE->getType()->getPointerAddressSpace(); - return MCConstantExpr::create(AT->getNullPointerValue(DstAddr), + return MCConstantExpr::create(AT.getNullPointerValue(DstAddr), OutContext); } } + return nullptr; +} + +const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { + if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext)) + return E; return AsmPrinter::lowerConstant(CV); } @@ -200,7 +243,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; - const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>(); AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this); StringRef Err; @@ -292,3 +335,47 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { } } } + +R600MCInstLower::R600MCInstLower(MCContext &Ctx, const R600Subtarget &ST, + const AsmPrinter &AP) : + AMDGPUMCInstLower(Ctx, ST, AP) { } + +void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + for (const MachineOperand &MO : MI->explicit_operands()) { + MCOperand MCOp; + lowerOperand(MO, MCOp); + OutMI.addOperand(MCOp); + } +} + +void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) { + const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>(); + R600MCInstLower MCInstLowering(OutContext, STI, *this); + + StringRef Err; + if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { + LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + C.emitError("Illegal instruction detected: " + Err); + MI->print(errs()); + } + + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); + while (I != MBB->instr_end() && I->isInsideBundle()) { + EmitInstruction(&*I); + ++I; + } + } else { + MCInst TmpInst; + MCInstLowering.lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + } +} + +const MCExpr *R600AsmPrinter::lowerConstant(const Constant *CV) { + if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext)) + return E; + return AsmPrinter::lowerConstant(CV); +} diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h deleted file mode 100644 index 57d2d85daecd..000000000000 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ /dev/null @@ -1,46 +0,0 @@ -//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H - -namespace llvm { - -class AMDGPUSubtarget; -class AsmPrinter; -class MachineBasicBlock; -class MachineInstr; -class MachineOperand; -class MCContext; -class MCExpr; -class MCInst; -class MCOperand; - -class AMDGPUMCInstLower { - MCContext &Ctx; - const AMDGPUSubtarget &ST; - const AsmPrinter &AP; - - const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB, - const MachineOperand &MO) const; - -public: - AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST, - const AsmPrinter &AP); - - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; - - /// \brief Lower a MachineInstr to an MCInst - void lower(const MachineInstr *MI, MCInst &OutMI) const; - -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 20918233e447..6f44e2dbb2d5 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/Compiler.h" @@ -658,7 +659,7 @@ RegionMRT *MRT::buildMRT(MachineFunction &MF, continue; } - DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n"); + LLVM_DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n"); MBBMRT *NewMBB = new MBBMRT(MBB); MachineRegion *Region = RegionInfo->getRegionFor(MBB); @@ -695,18 +696,19 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { if (TRI->isVirtualRegister(Reg)) { - DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) + << "\n"); // If this is a source register to a PHI we are chaining, it // must be live out. 
if (PHIInfo.isSource(Reg)) { - DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n"); addLiveOut(Reg); } else { // If this is live out of the MBB for (auto &UI : MRI->use_operands(Reg)) { if (UI.getParent()->getParent() != MBB) { - DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB) - << "): " << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB) + << "): " << printReg(Reg, TRI) << "\n"); addLiveOut(Reg); } else { // If the use is in the same MBB we have to make sure @@ -717,8 +719,8 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, MIE = UseInstr->getParent()->instr_end(); MII != MIE; ++MII) { if ((&(*MII)) == DefInstr) { - DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI) - << "\n"); + LLVM_DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI) + << "\n"); addLiveOut(Reg); } } @@ -734,11 +736,12 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { if (TRI->isVirtualRegister(Reg)) { - DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) + << "\n"); for (auto &UI : MRI->use_operands(Reg)) { if (!Region->contains(UI.getParent()->getParent())) { - DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region - << "): " << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region + << "): " << printReg(Reg, TRI) << "\n"); addLiveOut(Reg); } } @@ -749,8 +752,8 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { - DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB) - << ")-\n"); + LLVM_DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB) + << ")-\n"); for (auto &II : *MBB) { for (auto &RI : II.defs()) { storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo); @@ -774,9 +777,10 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, for (int i = 0; i < numPreds; ++i) { if (getPHIPred(PHI, i) == MBB) { unsigned PHIReg = getPHISourceReg(PHI, i); - DEBUG(dbgs() << "Add LiveOut (PhiSource " << printMBBReference(*MBB) - << " -> " << printMBBReference(*(*SI)) - << "): " << printReg(PHIReg, TRI) << "\n"); + LLVM_DEBUG(dbgs() + << "Add LiveOut (PhiSource " << printMBBReference(*MBB) + << " -> " << printMBBReference(*(*SI)) + << "): " << printReg(PHIReg, TRI) << "\n"); addLiveOut(PHIReg); } } @@ -784,7 +788,7 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, } } - DEBUG(dbgs() << "-Store Live Outs Endn-\n"); + LLVM_DEBUG(dbgs() << "-Store Live Outs Endn-\n"); } void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB, @@ -844,8 +848,8 @@ void LinearizedRegion::storeLiveOuts(RegionMRT *Region, for (int i = 0; i < numPreds; ++i) { if (Region->contains(getPHIPred(PHI, i))) { unsigned PHIReg = getPHISourceReg(PHI, i); - DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region - << "): " << printReg(PHIReg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region + << "): " << printReg(PHIReg, TRI) << "\n"); addLiveOut(PHIReg); } } @@ -909,20 +913,21 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister, bool IncludeLoopPHI) { assert(Register != NewRegister && "Cannot replace a reg with 
itself"); - DEBUG(dbgs() << "Pepareing to replace register (region): " - << printReg(Register, MRI->getTargetRegisterInfo()) << " with " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); + LLVM_DEBUG( + dbgs() << "Pepareing to replace register (region): " + << printReg(Register, MRI->getTargetRegisterInfo()) << " with " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); // If we are replacing outside, we also need to update the LiveOuts if (ReplaceOutside && (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) { LinearizedRegion *Current = this; while (Current != nullptr && Current->getEntry() != nullptr) { - DEBUG(dbgs() << "Region before register replace\n"); - DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); + LLVM_DEBUG(dbgs() << "Region before register replace\n"); + LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); Current->replaceLiveOut(Register, NewRegister); - DEBUG(dbgs() << "Region after register replace\n"); - DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); + LLVM_DEBUG(dbgs() << "Region after register replace\n"); + LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); Current = Current->getParent(); } } @@ -946,16 +951,16 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister, if (ShouldReplace) { if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { - DEBUG(dbgs() << "Trying to substitute physical register: " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); + LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); llvm_unreachable("Cannot substitute physical registers"); } else { - DEBUG(dbgs() << "Replacing register (region): " - << printReg(Register, MRI->getTargetRegisterInfo()) - << " with " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); + LLVM_DEBUG(dbgs() << "Replacing register (region): " + << printReg(Register, MRI->getTargetRegisterInfo()) + << " with " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); O.setReg(NewRegister); } } @@ -1022,18 +1027,18 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { if (hasNoDef(Reg, MRI)) continue; if (!MRI->hasOneDef(Reg)) { - DEBUG(this->getEntry()->getParent()->dump()); - DEBUG(dbgs() << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(this->getEntry()->getParent()->dump()); + LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << "\n"); } if (MRI->def_begin(Reg) == MRI->def_end()) { - DEBUG(dbgs() << "Register " - << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has NO defs\n"); + LLVM_DEBUG(dbgs() << "Register " + << printReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); } else if (!MRI->hasOneDef(Reg)) { - DEBUG(dbgs() << "Register " - << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has multiple defs\n"); + LLVM_DEBUG(dbgs() << "Register " + << printReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); } assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); @@ -1041,8 +1046,8 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { MachineOperand *UseOperand = &(RI); bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB; if (UseIsOutsideDefMBB && UseOperand->isKill()) { - DEBUG(dbgs() << "Removing kill flag on register: " - << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Removing kill flag on register: " + << printReg(Reg, TRI) << "\n"); 
UseOperand->setIsKill(false); } } @@ -1415,8 +1420,8 @@ void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) { MachineInstr &Instr = *I; if (Instr.isPHI()) { unsigned PHIDestReg = getPHIDestReg(Instr); - DEBUG(dbgs() << "Extractking killed phi:\n"); - DEBUG(Instr.dump()); + LLVM_DEBUG(dbgs() << "Extractking killed phi:\n"); + LLVM_DEBUG(Instr.dump()); PHIs.insert(&Instr); PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc()); storePHILinearizationInfoDest(PHIDestReg, Instr); @@ -1448,9 +1453,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, MachineBasicBlock *SourceMBB, SmallVector<unsigned, 2> &PHIIndices, unsigned *ReplaceReg) { - DEBUG(dbgs() << "Shrink PHI: "); - DEBUG(PHI.dump()); - DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); + LLVM_DEBUG(dbgs() << "Shrink PHI: "); + LLVM_DEBUG(PHI.dump()); + LLVM_DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI) + << " = PHI("); bool Replaced = false; unsigned NumInputs = getPHINumInputs(PHI); @@ -1480,8 +1486,8 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, if (SourceMBB) { MIB.addReg(CombinedSourceReg); MIB.addMBB(SourceMBB); - DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " - << printMBBReference(*SourceMBB)); + LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " + << printMBBReference(*SourceMBB)); } for (unsigned i = 0; i < NumInputs; ++i) { @@ -1492,10 +1498,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, MachineBasicBlock *SourcePred = getPHIPred(PHI, i); MIB.addReg(SourceReg); MIB.addMBB(SourcePred); - DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*SourcePred)); + LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*SourcePred)); } - DEBUG(dbgs() << ")\n"); + LLVM_DEBUG(dbgs() << ")\n"); } PHI.eraseFromParent(); return Replaced; @@ -1504,9 +1510,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, void AMDGPUMachineCFGStructurizer::replacePHI( MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge, SmallVector<unsigned, 2> &PHIRegionIndices) { - DEBUG(dbgs() << "Replace PHI: "); - DEBUG(PHI.dump()); - DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); + LLVM_DEBUG(dbgs() << "Replace PHI: "); + LLVM_DEBUG(PHI.dump()); + LLVM_DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI) + << " = PHI("); bool HasExternalEdge = false; unsigned NumInputs = getPHINumInputs(PHI); @@ -1523,8 +1530,8 @@ void AMDGPUMachineCFGStructurizer::replacePHI( getPHIDestReg(PHI)); MIB.addReg(CombinedSourceReg); MIB.addMBB(LastMerge); - DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " - << printMBBReference(*LastMerge)); + LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " + << printMBBReference(*LastMerge)); for (unsigned i = 0; i < NumInputs; ++i) { if (isPHIRegionIndex(PHIRegionIndices, i)) { continue; @@ -1533,10 +1540,10 @@ void AMDGPUMachineCFGStructurizer::replacePHI( MachineBasicBlock *SourcePred = getPHIPred(PHI, i); MIB.addReg(SourceReg); MIB.addMBB(SourcePred); - DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*SourcePred)); + LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*SourcePred)); } - DEBUG(dbgs() << ")\n"); + LLVM_DEBUG(dbgs() << ")\n"); } else { replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg); } @@ -1546,9 +1553,9 @@ void AMDGPUMachineCFGStructurizer::replacePHI( void 
AMDGPUMachineCFGStructurizer::replaceEntryPHI( MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB, SmallVector<unsigned, 2> &PHIRegionIndices) { - DEBUG(dbgs() << "Replace entry PHI: "); - DEBUG(PHI.dump()); - DEBUG(dbgs() << " with "); + LLVM_DEBUG(dbgs() << "Replace entry PHI: "); + LLVM_DEBUG(PHI.dump()); + LLVM_DEBUG(dbgs() << " with "); unsigned NumInputs = getPHINumInputs(PHI); unsigned NumNonRegionInputs = NumInputs; @@ -1561,18 +1568,19 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI( if (NumNonRegionInputs == 0) { auto DestReg = getPHIDestReg(PHI); replaceRegisterWith(DestReg, CombinedSourceReg); - DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI) + << "\n"); PHI.eraseFromParent(); } else { - DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); + LLVM_DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); MachineBasicBlock *MBB = PHI.getParent(); MachineInstrBuilder MIB = BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), getPHIDestReg(PHI)); MIB.addReg(CombinedSourceReg); MIB.addMBB(IfMBB); - DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " - << printMBBReference(*IfMBB)); + LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " + << printMBBReference(*IfMBB)); unsigned NumInputs = getPHINumInputs(PHI); for (unsigned i = 0; i < NumInputs; ++i) { if (isPHIRegionIndex(PHIRegionIndices, i)) { @@ -1582,10 +1590,10 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI( MachineBasicBlock *SourcePred = getPHIPred(PHI, i); MIB.addReg(SourceReg); MIB.addMBB(SourcePred); - DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*SourcePred)); + LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*SourcePred)); } - DEBUG(dbgs() << ")\n"); + LLVM_DEBUG(dbgs() << ")\n"); PHI.eraseFromParent(); } } @@ -1607,8 +1615,9 @@ void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs( } } - DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is " - << (IsDead ? "dead" : "alive") << " after PHI replace\n"); + LLVM_DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is " + << (IsDead ? 
"dead" : "alive") + << " after PHI replace\n"); if (IsDead) { LRegion->removeLiveOut(Reg); } @@ -1682,8 +1691,8 @@ void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Regi void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB, MachineBasicBlock *Dest, const DebugLoc &DL) { - DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber() - << " -> " << Dest->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber() + << " -> " << Dest->getNumber() << "\n"); MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator(); bool HasTerminator = Terminator != MBB->instr_end(); if (HasTerminator) { @@ -1732,7 +1741,8 @@ AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) { MF->insert(ExitIter, LastMerge); LastMerge->addSuccessor(Exit); insertUnconditionalBranch(LastMerge, Exit); - DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() + << "\n"); } return LastMerge; } @@ -1748,11 +1758,12 @@ void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB, if (MergeBB->succ_begin() == MergeBB->succ_end()) { return; } - DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB) - << "): " << printReg(DestRegister, TRI) << " = PHI(" - << printReg(IfSourceRegister, TRI) << ", " - << printMBBReference(*IfBB) << printReg(CodeSourceRegister, TRI) - << ", " << printMBBReference(*CodeBB) << ")\n"); + LLVM_DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB) + << "): " << printReg(DestRegister, TRI) << " = PHI(" + << printReg(IfSourceRegister, TRI) << ", " + << printMBBReference(*IfBB) + << printReg(CodeSourceRegister, TRI) << ", " + << printMBBReference(*CodeBB) << ")\n"); const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin()); MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL, TII->get(TargetOpcode::PHI), DestRegister); @@ -1810,8 +1821,8 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB, for (auto SI : Succs) { std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI; - DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first) - << " -> " << printMBBReference(*Edge.second) << "\n"); + LLVM_DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first) + << " -> " << printMBBReference(*Edge.second) << "\n"); Edge.first->removeSuccessor(Edge.second); } } @@ -1844,13 +1855,13 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock( IfBB->addSuccessor(MergeBB); IfBB->addSuccessor(CodeBBStart); - DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n"); // Ensure that the MergeBB is a successor of the CodeEndBB. 
if (!CodeBBEnd->isSuccessor(MergeBB)) CodeBBEnd->addSuccessor(MergeBB); - DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart) << " through " - << printMBBReference(*CodeBBEnd) << "\n"); + LLVM_DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart) + << " through " << printMBBReference(*CodeBBEnd) << "\n"); // If we have a single predecessor we can find a reasonable debug location MachineBasicBlock *SinglePred = @@ -1935,16 +1946,18 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) { if (MRI->def_begin(Reg) == MRI->def_end()) { - DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has NO defs\n"); + LLVM_DEBUG(dbgs() << "Register " + << printReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); } else if (!MRI->hasOneDef(Reg)) { - DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has multiple defs\n"); - DEBUG(dbgs() << "DEFS BEGIN:\n"); + LLVM_DEBUG(dbgs() << "Register " + << printReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); + LLVM_DEBUG(dbgs() << "DEFS BEGIN:\n"); for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) { - DEBUG(DI->getParent()->dump()); + LLVM_DEBUG(DI->getParent()->dump()); } - DEBUG(dbgs() << "DEFS END\n"); + LLVM_DEBUG(dbgs() << "DEFS END\n"); } assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); @@ -1986,7 +1999,7 @@ void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB, const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg); unsigned NextDestReg = MRI->createVirtualRegister(RegClass); bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1; - DEBUG(dbgs() << "Insert Chained PHI\n"); + LLVM_DEBUG(dbgs() << "Insert Chained PHI\n"); insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg, SourceReg, IsLastDef); @@ -2022,16 +2035,16 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, } for (auto LI : OldLiveOuts) { - DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI)); + LLVM_DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI)); if (!containsDef(CodeBB, InnerRegion, LI) || (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) { // If the register simly lives through the CodeBB, we don't have // to rewrite anything since the register is not defined in this // part of the code. - DEBUG(dbgs() << "- through"); + LLVM_DEBUG(dbgs() << "- through"); continue; } - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "\n"); unsigned Reg = LI; if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) { // If the register is live out, we do want to create a phi, @@ -2048,12 +2061,12 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, unsigned IfSourceReg = MRI->createVirtualRegister(RegClass); // Create initializer, this value is never used, but is needed // to satisfy SSA. 
- DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n"); + LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n"); TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(), IfSourceReg, 0); InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI); - DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n"); + LLVM_DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n"); insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg, IfSourceReg, Reg, true); } @@ -2063,22 +2076,22 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, // is a source block for a definition. SmallVector<unsigned, 4> Sources; if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) { - DEBUG(dbgs() << "Inserting PHI Live Out from " << printMBBReference(*CodeBB) - << "\n"); + LLVM_DEBUG(dbgs() << "Inserting PHI Live Out from " + << printMBBReference(*CodeBB) << "\n"); for (auto SI : Sources) { unsigned DestReg; PHIInfo.findDest(SI, CodeBB, DestReg); insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI); } - DEBUG(dbgs() << "Insertion done.\n"); + LLVM_DEBUG(dbgs() << "Insertion done.\n"); } - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); } void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) { - DEBUG(dbgs() << "Before PHI Prune\n"); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(dbgs() << "Before PHI Prune\n"); + LLVM_DEBUG(PHIInfo.dump(MRI)); SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4> ElimiatedSources; for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; @@ -2118,8 +2131,8 @@ void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) { PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo), std::get<2>(SourceInfo)); } - DEBUG(dbgs() << "After PHI Prune\n"); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(dbgs() << "After PHI Prune\n"); + LLVM_DEBUG(PHIInfo.dump(MRI)); } void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion, @@ -2127,8 +2140,8 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio MachineBasicBlock *Entry = CurrentRegion->getEntry(); MachineBasicBlock *Exit = CurrentRegion->getExit(); - DEBUG(dbgs() << "RegionExit: " << Exit->getNumber() - << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "RegionExit: " << Exit->getNumber() << " Pred: " + << (*(Entry->pred_begin()))->getNumber() << "\n"); int NumSources = 0; auto SE = PHIInfo.sources_end(DestReg); @@ -2145,7 +2158,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio const DebugLoc &DL = Entry->findDebugLoc(Entry->begin()); MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL, TII->get(TargetOpcode::PHI), DestReg); - DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI("); + LLVM_DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI("); unsigned CurrentBackedgeReg = 0; @@ -2169,19 +2182,19 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1)); BackedgePHI.addMBB((*SRI).second); CurrentBackedgeReg = NewBackedgeReg; - DEBUG(dbgs() << "Inserting backedge PHI: " - << printReg(NewBackedgeReg, TRI) << " = PHI(" - << printReg(CurrentBackedgeReg, TRI) << ", " - << printMBBReference(*getPHIPred(*PHIDefInstr, 0)) - << ", " - << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI) - << ", " << printMBBReference(*(*SRI).second)); + 
LLVM_DEBUG(dbgs() + << "Inserting backedge PHI: " + << printReg(NewBackedgeReg, TRI) << " = PHI(" + << printReg(CurrentBackedgeReg, TRI) << ", " + << printMBBReference(*getPHIPred(*PHIDefInstr, 0)) << ", " + << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI) << ", " + << printMBBReference(*(*SRI).second)); } } else { MIB.addReg(SourceReg); MIB.addMBB((*SRI).second); - DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*(*SRI).second) << ", "); + LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*(*SRI).second) << ", "); } } @@ -2189,16 +2202,16 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio if (CurrentBackedgeReg != 0) { MIB.addReg(CurrentBackedgeReg); MIB.addMBB(Exit); - DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", " - << printMBBReference(*Exit) << ")\n"); + LLVM_DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", " + << printMBBReference(*Exit) << ")\n"); } else { - DEBUG(dbgs() << ")\n"); + LLVM_DEBUG(dbgs() << ")\n"); } } } void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) { - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; ++DRI) { @@ -2219,19 +2232,19 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, MachineOperand &O = *I; ++I; if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { - DEBUG(dbgs() << "Trying to substitute physical register: " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); + LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); llvm_unreachable("Cannot substitute physical registers"); // We don't handle physical registers, but if we need to // in the future This is how we do it: // O.substPhysReg(NewRegister, *TRI); } else { - DEBUG(dbgs() << "Replacing register: " - << printReg(Register, MRI->getTargetRegisterInfo()) - << " with " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); + LLVM_DEBUG(dbgs() << "Replacing register: " + << printReg(Register, MRI->getTargetRegisterInfo()) + << " with " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); O.setReg(NewRegister); } } @@ -2239,20 +2252,20 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, getRegionMRT()->replaceLiveOutReg(Register, NewRegister); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); } void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) { - DEBUG(dbgs() << "Resolve PHI Infos\n"); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(dbgs() << "Resolve PHI Infos\n"); + LLVM_DEBUG(PHIInfo.dump(MRI)); for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; ++DRI) { unsigned DestReg = *DRI; - DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n"); auto SRI = PHIInfo.sources_begin(DestReg); unsigned SourceReg = (*SRI).first; - DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) - << " SourceReg: " << printReg(SourceReg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) + << " SourceReg: " << printReg(SourceReg, TRI) << "\n"); assert(PHIInfo.sources_end(DestReg) == ++SRI && "More than one phi source in entry node"); @@ -2326,9 +2339,9 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( MachineOperand RegOp = 
MachineOperand::CreateReg(Reg, false, false, true); ArrayRef<MachineOperand> Cond(RegOp); - DEBUG(dbgs() << "RegionExitReg: "); - DEBUG(Cond[0].print(dbgs(), TRI)); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "RegionExitReg: "); + LLVM_DEBUG(Cond[0].print(dbgs(), TRI)); + LLVM_DEBUG(dbgs() << "\n"); TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, Cond, DebugLoc()); RegionExit->addSuccessor(CurrentRegion->getEntry()); @@ -2338,12 +2351,12 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo); InnerRegion.setParent(CurrentRegion); - DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); + LLVM_DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn, CodeBBSelectReg); InnerRegion.addMBB(MergeBB); - DEBUG(InnerRegion.print(dbgs(), TRI)); + LLVM_DEBUG(InnerRegion.print(dbgs(), TRI)); rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion); extractKilledPHIs(CodeBB); if (IsRegionEntryBB) { @@ -2384,16 +2397,16 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( CurrentRegion->getRegionMRT()->getEntry()->getNumber()); MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true); ArrayRef<MachineOperand> Cond(RegOp); - DEBUG(dbgs() << "RegionExitReg: "); - DEBUG(Cond[0].print(dbgs(), TRI)); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "RegionExitReg: "); + LLVM_DEBUG(Cond[0].print(dbgs(), TRI)); + LLVM_DEBUG(dbgs() << "\n"); TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, Cond, DebugLoc()); RegionExit->addSuccessor(IfBB); } } CurrentRegion->addMBBs(InnerRegion); - DEBUG(dbgs() << "Insert BB Select PHI (region)\n"); + LLVM_DEBUG(dbgs() << "Insert BB Select PHI (region)\n"); insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn, CodeBBSelectReg); @@ -2439,15 +2452,16 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI, MachineInstrBuilder MIB = BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), NewDestReg); - DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI) << " = PHI("); + LLVM_DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI) + << " = PHI("); MIB.addReg(PHISource); MIB.addMBB(Entry); - DEBUG(dbgs() << printReg(PHISource, TRI) << ", " - << printMBBReference(*Entry)); + LLVM_DEBUG(dbgs() << printReg(PHISource, TRI) << ", " + << printMBBReference(*Entry)); MIB.addReg(RegionSourceReg); MIB.addMBB(RegionSourceMBB); - DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", " - << printMBBReference(*RegionSourceMBB) << ")\n"); + LLVM_DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", " + << printMBBReference(*RegionSourceMBB) << ")\n"); } void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry, @@ -2480,7 +2494,8 @@ AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) { LRegion->addMBB(NewExit); LRegion->setExit(NewExit); - DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() + << "\n"); // Replace any PHI Predecessors in the successor with NewExit for (auto &II : *Succ) { @@ -2528,9 +2543,9 @@ AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) { MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI()); MachineBasicBlock *Exit = LRegion->getExit(); - DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to " - << 
printMBBReference(*Entry) << " -> " - << printMBBReference(*EntrySucc) << "\n"); + LLVM_DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to " + << printMBBReference(*Entry) << " -> " + << printMBBReference(*EntrySucc) << "\n"); LRegion->addMBB(EntrySucc); // Make the backedge go to Entry Succ @@ -2621,21 +2636,21 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { rewriteRegionExitPHIs(Region, LastMerge, LRegion); removeOldExitPreds(Region); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); SetVector<MRT *> *Children = Region->getChildren(); - DEBUG(dbgs() << "===========If Region Start===============\n"); + LLVM_DEBUG(dbgs() << "===========If Region Start===============\n"); if (LRegion->getHasLoop()) { - DEBUG(dbgs() << "Has Backedge: Yes\n"); + LLVM_DEBUG(dbgs() << "Has Backedge: Yes\n"); } else { - DEBUG(dbgs() << "Has Backedge: No\n"); + LLVM_DEBUG(dbgs() << "Has Backedge: No\n"); } unsigned BBSelectRegIn; unsigned BBSelectRegOut; for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) { - DEBUG(dbgs() << "CurrentRegion: \n"); - DEBUG(LRegion->print(dbgs(), TRI)); + LLVM_DEBUG(dbgs() << "CurrentRegion: \n"); + LLVM_DEBUG(LRegion->print(dbgs(), TRI)); auto CNI = CI; ++CNI; @@ -2649,9 +2664,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { // We found the block is the exit of an inner region, we need // to put it in the current linearized region. - DEBUG(dbgs() << "Linearizing region: "); - DEBUG(InnerLRegion->print(dbgs(), TRI)); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "Linearizing region: "); + LLVM_DEBUG(InnerLRegion->print(dbgs(), TRI)); + LLVM_DEBUG(dbgs() << "\n"); MachineBasicBlock *InnerEntry = InnerLRegion->getEntry(); if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) { @@ -2669,10 +2684,10 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { BBSelectRegOut = Child->getBBSelectRegOut(); BBSelectRegIn = Child->getBBSelectRegIn(); - DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) - << "\n"); - DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) - << "\n"); + LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) + << "\n"); + LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) + << "\n"); MachineBasicBlock *IfEnd = CurrentMerge; CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion, @@ -2681,7 +2696,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); } else { MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB(); - DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n"); if (MBB == getSingleExitNode(*(MBB->getParent()))) { // If this is the exit block then we need to skip to the next. 
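The AMDGPUMachineCFGStructurizer hunks above are dominated by a mechanical rename of the old DEBUG(...) logging macro to LLVM_DEBUG(...). As a rough illustration of how that macro is normally used (this sketch is not part of the patch, and the debug type below is invented): the statement is compiled in only for asserts-enabled builds and is further gated at run time by -debug or -debug-only=<DEBUG_TYPE>.

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical debug type, chosen only for this illustration.
#define DEBUG_TYPE "my-example-pass"

static void logVirtReg(unsigned Reg) {
  // Compiled out in NDEBUG builds; in asserts builds it prints only when
  // the tool is run with -debug or -debug-only=my-example-pass.
  LLVM_DEBUG(dbgs() << "processing vreg " << Reg << '\n');
}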
@@ -2693,10 +2708,10 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { BBSelectRegOut = Child->getBBSelectRegOut(); BBSelectRegIn = Child->getBBSelectRegIn(); - DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) - << "\n"); - DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) - << "\n"); + LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) + << "\n"); + LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) + << "\n"); MachineBasicBlock *IfEnd = CurrentMerge; // This is a basic block that is not part of an inner region, we @@ -2707,7 +2722,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); } - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); } } @@ -2728,7 +2743,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { NewInReg, Region->getEntry()->getNumber()); // Need to be careful about updating the registers inside the region. LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI); - DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n"); + LLVM_DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n"); insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc, InnerSelectReg, NewInReg, LRegion->getRegionMRT()->getInnerOutputRegister()); @@ -2740,11 +2755,11 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { TII->insertReturn(*LastMerge); } - DEBUG(Region->getEntry()->getParent()->dump()); - DEBUG(LRegion->print(dbgs(), TRI)); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(Region->getEntry()->getParent()->dump()); + LLVM_DEBUG(LRegion->print(dbgs(), TRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); - DEBUG(dbgs() << "===========If Region End===============\n"); + LLVM_DEBUG(dbgs() << "===========If Region End===============\n"); Region->setLinearizedRegion(LRegion); return true; @@ -2784,12 +2799,12 @@ bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region, } void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) { - DEBUG(dbgs() << "Fallthrough Map:\n"); + LLVM_DEBUG(dbgs() << "Fallthrough Map:\n"); for (auto &MBBI : MF) { MachineBasicBlock *MBB = MBBI.getFallThrough(); if (MBB != nullptr) { - DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> " - << MBB->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> " + << MBB->getNumber() << "\n"); } FallthroughMap[&MBBI] = MBB; } @@ -2800,8 +2815,8 @@ void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region, LinearizedRegion *LRegion = new LinearizedRegion(); if (SelectOut) { LRegion->addLiveOut(SelectOut); - DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI) - << "\n"); + LLVM_DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI) + << "\n"); } LRegion->setRegionMRT(Region); Region->setLinearizedRegion(LRegion); @@ -2856,26 +2871,26 @@ static void checkRegOnlyPHIInputs(MachineFunction &MF) { } bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); TRI = ST.getRegisterInfo(); MRI = &(MF.getRegInfo()); initFallthroughMap(MF); checkRegOnlyPHIInputs(MF); - DEBUG(dbgs() << "----STRUCTURIZER START----\n"); - DEBUG(MF.dump()); + LLVM_DEBUG(dbgs() << "----STRUCTURIZER START----\n"); + 
LLVM_DEBUG(MF.dump()); Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo()); - DEBUG(Regions->dump()); + LLVM_DEBUG(Regions->dump()); RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI); setRegionMRT(RTree); initializeSelectRegisters(RTree, 0, MRI, TII); - DEBUG(RTree->dump(TRI)); + LLVM_DEBUG(RTree->dump(TRI)); bool result = structurizeRegions(RTree, true); delete RTree; - DEBUG(dbgs() << "----STRUCTURIZER END----\n"); + LLVM_DEBUG(dbgs() << "----STRUCTURIZER END----\n"); initFallthroughMap(MF); return result; } diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index b7c8c1213537..13b4b50149ce 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -9,20 +9,38 @@ #include "AMDGPUMachineFunction.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUPerfHintAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" using namespace llvm; AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), LocalMemoryObjects(), - KernArgSize(0), + ExplicitKernArgSize(0), MaxKernArgAlign(0), LDSSize(0), - ABIArgOffset(0), IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), - NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { + NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath), + MemoryBound(false), + WaveLimiter(false) { + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); + // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, // except reserved size is not correctly aligned. + const Function &F = MF.getFunction(); + + if (auto *Resolver = MF.getMMI().getResolver()) { + if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>( + Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) { + MemoryBound = PHA->isMemoryBound(&F); + WaveLimiter = PHA->needsWaveLimiter(&F); + } + } + + CallingConv::ID CC = F.getCallingConv(); + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) + ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign); } unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 99bb61b21db0..8d6b871bc03e 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -15,57 +15,43 @@ namespace llvm { +class GCNSubtarget; + class AMDGPUMachineFunction : public MachineFunctionInfo { /// A map to keep track of local memory objects and their offsets within the /// local memory space. SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects; - uint64_t KernArgSize; - unsigned MaxKernArgAlign; +protected: + uint64_t ExplicitKernArgSize; // Cache for this. + unsigned MaxKernArgAlign; // Cache for this. /// Number of bytes in the LDS that are being used. unsigned LDSSize; - // FIXME: This should probably be removed. - /// Start of implicit kernel args - unsigned ABIArgOffset; - - // Kernels + shaders. i.e. functions called by the driver and not not called + // Kernels + shaders. i.e. functions called by the driver and not called // by other functions. bool IsEntryFunction; bool NoSignedZerosFPMath; -public: - AMDGPUMachineFunction(const MachineFunction &MF); - - uint64_t allocateKernArg(uint64_t Size, unsigned Align) { - assert(isPowerOf2_32(Align)); - KernArgSize = alignTo(KernArgSize, Align); + // Function may be memory bound. 
+ bool MemoryBound; - uint64_t Result = KernArgSize; - KernArgSize += Size; + // Kernel may need limited waves per EU for better performance. + bool WaveLimiter; - MaxKernArgAlign = std::max(Align, MaxKernArgAlign); - return Result; - } +public: + AMDGPUMachineFunction(const MachineFunction &MF); - uint64_t getKernArgSize() const { - return KernArgSize; + uint64_t getExplicitKernArgSize() const { + return ExplicitKernArgSize; } unsigned getMaxKernArgAlign() const { return MaxKernArgAlign; } - void setABIArgOffset(unsigned NewOffset) { - ABIArgOffset = NewOffset; - } - - unsigned getABIArgOffset() const { - return ABIArgOffset; - } - unsigned getLDSSize() const { return LDSSize; } @@ -78,6 +64,14 @@ public: return NoSignedZerosFPMath; } + bool isMemoryBound() const { + return MemoryBound; + } + + bool needsWaveLimiter() const { + return WaveLimiter; + } + unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV); }; diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp index 3164140abe29..7b9f673c418c 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU Machine Module Info. +/// AMDGPU Machine Module Info. /// // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h index 1a728c6bd04a..1219ab26fb69 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h +++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU Machine Module Info. +/// AMDGPU Machine Module Info. /// // //===----------------------------------------------------------------------===// @@ -30,14 +30,14 @@ private: // All supported memory/synchronization scopes can be found here: // http://llvm.org/docs/AMDGPUUsage.html#memory-scopes - /// \brief Agent synchronization scope ID. + /// Agent synchronization scope ID. SyncScope::ID AgentSSID; - /// \brief Workgroup synchronization scope ID. + /// Workgroup synchronization scope ID. SyncScope::ID WorkgroupSSID; - /// \brief Wavefront synchronization scope ID. + /// Wavefront synchronization scope ID. SyncScope::ID WavefrontSSID; - /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a + /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization /// scope. /// @@ -74,7 +74,7 @@ public: return WavefrontSSID; } - /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a + /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization /// scope. 
/// diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp index 7263ba73d155..995d9ae3907f 100644 --- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp +++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -15,6 +15,7 @@ #include "AMDGPUMacroFusion.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MacroFusion.h" @@ -22,7 +23,7 @@ using namespace llvm; namespace { -/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// Check if the instr pair, FirstMI and SecondMI, should be fused /// together. Given SecondMI, when FirstMI is unspecified, then check if /// SecondMI may be part of a fused pair at all. static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_, diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index bb65636f15af..7bd8533a0ccf 100644 --- a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // \file -// \brief This post-linking pass replaces the function pointer of enqueued +// This post-linking pass replaces the function pointer of enqueued // block kernel with a global variable (runtime handle) and adds // "runtime-handle" attribute to the enqueued block kernel. // @@ -36,7 +36,9 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/User.h" #include "llvm/Pass.h" @@ -49,7 +51,7 @@ using namespace llvm; namespace { -/// \brief Lower enqueued blocks. +/// Lower enqueued blocks. class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { public: static char ID; @@ -80,49 +82,63 @@ static void collectCallers(Function *F, DenseSet<Function *> &Callers) { for (auto U : F->users()) { if (auto *CI = dyn_cast<CallInst>(&*U)) { auto *Caller = CI->getParent()->getParent(); - if (Callers.count(Caller)) - continue; - Callers.insert(Caller); - collectCallers(Caller, Callers); + if (Callers.insert(Caller).second) + collectCallers(Caller, Callers); } } } +/// If \p U is instruction or constant, collect functions which directly or +/// indirectly use it. 
+static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) { + if (auto *I = dyn_cast<Instruction>(U)) { + auto *F = I->getParent()->getParent(); + if (Funcs.insert(F).second) + collectCallers(F, Funcs); + return; + } + if (!isa<Constant>(U)) + return; + for (auto UU : U->users()) + collectFunctionUsers(&*UU, Funcs); +} + bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { DenseSet<Function *> Callers; auto &C = M.getContext(); bool Changed = false; for (auto &F : M.functions()) { if (F.hasFnAttribute("enqueued-block")) { - if (!F.hasOneUse() || !F.user_begin()->hasOneUse() || - !isa<ConstantExpr>(*F.user_begin()) || - !isa<ConstantExpr>(*F.user_begin()->user_begin())) { - continue; + if (!F.hasName()) { + SmallString<64> Name; + Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel", + M.getDataLayout()); + F.setName(Name); } - auto *BitCast = cast<ConstantExpr>(*F.user_begin()); - auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin()); - auto RuntimeHandle = (F.getName() + "_runtime_handle").str(); + LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n'); + auto RuntimeHandle = (F.getName() + ".runtime_handle").str(); + auto T = ArrayType::get(Type::getInt64Ty(C), 2); auto *GV = new GlobalVariable( - M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS), - /*IsConstant=*/true, GlobalValue::ExternalLinkage, - /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr, - GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, - /*IsExternallyInitialized=*/true); - DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); - auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType()); - AddrCast->replaceAllUsesWith(NewPtr); - F.addFnAttr("runtime-handle", RuntimeHandle); - F.setLinkage(GlobalValue::ExternalLinkage); - - // Collect direct or indirect callers of enqueue_kernel. - for (auto U : NewPtr->users()) { - if (auto *I = dyn_cast<Instruction>(&*U)) { - auto *F = I->getParent()->getParent(); - Callers.insert(F); - collectCallers(F, Callers); - } + M, T, + /*IsConstant=*/false, GlobalValue::ExternalLinkage, + /*Initializer=*/Constant::getNullValue(T), RuntimeHandle, + /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, + AMDGPUAS::GLOBAL_ADDRESS, + /*IsExternallyInitialized=*/false); + LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); + + for (auto U : F.users()) { + auto *UU = &*U; + if (!isa<ConstantExpr>(UU)) + continue; + collectFunctionUsers(UU, Callers); + auto *BitCast = cast<ConstantExpr>(UU); + auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType()); + BitCast->replaceAllUsesWith(NewPtr); + F.addFnAttr("runtime-handle", RuntimeHandle); + F.setLinkage(GlobalValue::ExternalLinkage); + Changed = true; } - Changed = true; } } @@ -130,6 +146,7 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) continue; F->addFnAttr("calls-enqueue-kernel"); + LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n'); } return Changed; } diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp new file mode 100644 index 000000000000..3cfdccc9fe51 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -0,0 +1,397 @@ +//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Analyzes if a function potentially memory bound and if a kernel +/// kernel may benefit from limiting number of waves to reduce cache thrashing. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUPerfHintAnalysis.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-perf-hint" + +static cl::opt<unsigned> + MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden, + cl::desc("Function mem bound threshold in %")); + +static cl::opt<unsigned> + LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden, + cl::desc("Kernel limit wave threshold in %")); + +static cl::opt<unsigned> + IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden, + cl::desc("Indirect access memory instruction weight")); + +static cl::opt<unsigned> + LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden, + cl::desc("Large stride memory access weight")); + +static cl::opt<unsigned> + LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden, + cl::desc("Large stride memory access threshold")); + +STATISTIC(NumMemBound, "Number of functions marked as memory bound"); +STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave"); + +char llvm::AMDGPUPerfHintAnalysis::ID = 0; +char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID; + +INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE, + "Analysis if a function is memory bound", true, true) + +namespace { + +struct AMDGPUPerfHint { + friend AMDGPUPerfHintAnalysis; + +public: + AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_, + const TargetLowering *TLI_) + : FIM(FIM_), DL(nullptr), TLI(TLI_) {} + + void runOnFunction(Function &F); + +private: + struct MemAccessInfo { + const Value *V; + const Value *Base; + int64_t Offset; + MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {} + bool isLargeStride(MemAccessInfo &Reference) const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + Printable print() const { + return Printable([this](raw_ostream &OS) { + OS << "Value: " << *V << '\n' + << "Base: " << *Base << " Offset: " << Offset << '\n'; + }); + } +#endif + }; + + MemAccessInfo makeMemAccessInfo(Instruction *) const; + + MemAccessInfo LastAccess; // Last memory access info + + AMDGPUPerfHintAnalysis::FuncInfoMap &FIM; + + const DataLayout *DL; + + AMDGPUAS AS; + + const TargetLowering *TLI; + + void visit(const Function &F); + static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F); + static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F); + + bool isIndirectAccess(const Instruction *Inst) const; + + /// Check if the instruction is large stride. 
+ /// The purpose is to identify memory access pattern like: + /// x = a[i]; + /// y = a[i+1000]; + /// z = a[i+2000]; + /// In the above example, the second and third memory access will be marked + /// large stride memory access. + bool isLargeStride(const Instruction *Inst); + + bool isGlobalAddr(const Value *V) const; + bool isLocalAddr(const Value *V) const; + bool isConstantAddr(const Value *V) const; +}; + +static const Value *getMemoryInstrPtr(const Instruction *Inst) { + if (auto LI = dyn_cast<LoadInst>(Inst)) { + return LI->getPointerOperand(); + } + if (auto SI = dyn_cast<StoreInst>(Inst)) { + return SI->getPointerOperand(); + } + if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) { + return AI->getPointerOperand(); + } + if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) { + return AI->getPointerOperand(); + } + if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) { + return MI->getRawDest(); + } + + return nullptr; +} + +bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { + LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); + SmallSet<const Value *, 32> WorkSet; + SmallSet<const Value *, 32> Visited; + if (const Value *MO = getMemoryInstrPtr(Inst)) { + if (isGlobalAddr(MO)) + WorkSet.insert(MO); + } + + while (!WorkSet.empty()) { + const Value *V = *WorkSet.begin(); + WorkSet.erase(*WorkSet.begin()); + if (!Visited.insert(V).second) + continue; + LLVM_DEBUG(dbgs() << " check: " << *V << '\n'); + + if (auto LD = dyn_cast<LoadInst>(V)) { + auto M = LD->getPointerOperand(); + if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) { + LLVM_DEBUG(dbgs() << " is IA\n"); + return true; + } + continue; + } + + if (auto GEP = dyn_cast<GetElementPtrInst>(V)) { + auto P = GEP->getPointerOperand(); + WorkSet.insert(P); + for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I) + WorkSet.insert(GEP->getOperand(I)); + continue; + } + + if (auto U = dyn_cast<UnaryInstruction>(V)) { + WorkSet.insert(U->getOperand(0)); + continue; + } + + if (auto BO = dyn_cast<BinaryOperator>(V)) { + WorkSet.insert(BO->getOperand(0)); + WorkSet.insert(BO->getOperand(1)); + continue; + } + + if (auto S = dyn_cast<SelectInst>(V)) { + WorkSet.insert(S->getFalseValue()); + WorkSet.insert(S->getTrueValue()); + continue; + } + + if (auto E = dyn_cast<ExtractElementInst>(V)) { + WorkSet.insert(E->getVectorOperand()); + continue; + } + + LLVM_DEBUG(dbgs() << " dropped\n"); + } + + LLVM_DEBUG(dbgs() << " is not IA\n"); + return false; +} + +void AMDGPUPerfHint::visit(const Function &F) { + auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo())); + if (!FIP.second) + return; + + AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second; + + LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n'); + + for (auto &B : F) { + LastAccess = MemAccessInfo(); + for (auto &I : B) { + if (getMemoryInstrPtr(&I)) { + if (isIndirectAccess(&I)) + ++FI.IAMInstCount; + if (isLargeStride(&I)) + ++FI.LSMInstCount; + ++FI.MemInstCount; + ++FI.InstCount; + continue; + } + CallSite CS(const_cast<Instruction *>(&I)); + if (CS) { + Function *Callee = CS.getCalledFunction(); + if (!Callee || Callee->isDeclaration()) { + ++FI.InstCount; + continue; + } + if (&F == Callee) // Handle immediate recursion + continue; + + visit(*Callee); + auto Loc = FIM.find(Callee); + + assert(Loc != FIM.end() && "No func info"); + FI.MemInstCount += Loc->second.MemInstCount; + FI.InstCount += Loc->second.InstCount; + FI.IAMInstCount += Loc->second.IAMInstCount; + FI.LSMInstCount += 
Loc->second.LSMInstCount; + } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + TargetLoweringBase::AddrMode AM; + auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL); + AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr)); + AM.HasBaseReg = !AM.BaseGV; + if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(), + GEP->getPointerAddressSpace())) + // Offset will likely be folded into load or store + continue; + ++FI.InstCount; + } else { + ++FI.InstCount; + } + } + } +} + +void AMDGPUPerfHint::runOnFunction(Function &F) { + if (FIM.find(&F) != FIM.end()) + return; + + const Module &M = *F.getParent(); + DL = &M.getDataLayout(); + AS = AMDGPU::getAMDGPUAS(M); + + visit(F); + auto Loc = FIM.find(&F); + + assert(Loc != FIM.end() && "No func info"); + LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount + << '\n' + << " IAMInst: " << Loc->second.IAMInstCount << '\n' + << " LSMInst: " << Loc->second.LSMInstCount << '\n' + << " TotalInst: " << Loc->second.InstCount << '\n'); + + auto &FI = Loc->second; + + if (isMemBound(FI)) { + LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n"); + NumMemBound++; + } + + if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) { + LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n"); + NumLimitWave++; + } +} + +bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { + return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh; +} + +bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { + return ((FI.MemInstCount + FI.IAMInstCount * IAWeight + + FI.LSMInstCount * LSWeight) * + 100 / FI.InstCount) > LimitWaveThresh; +} + +bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const { + if (auto PT = dyn_cast<PointerType>(V->getType())) { + unsigned As = PT->getAddressSpace(); + // Flat likely points to global too. + return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS; + } + return false; +} + +bool AMDGPUPerfHint::isLocalAddr(const Value *V) const { + if (auto PT = dyn_cast<PointerType>(V->getType())) + return PT->getAddressSpace() == AS.LOCAL_ADDRESS; + return false; +} + +bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) { + LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n'); + + MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst)); + bool IsLargeStride = MAI.isLargeStride(LastAccess); + if (MAI.Base) + LastAccess = std::move(MAI); + + return IsLargeStride; +} + +AMDGPUPerfHint::MemAccessInfo +AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { + MemAccessInfo MAI; + const Value *MO = getMemoryInstrPtr(Inst); + + LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n'); + // Do not treat local-addr memory access as large stride. + if (isLocalAddr(MO)) + return MAI; + + MAI.V = MO; + MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL); + return MAI; +} + +bool AMDGPUPerfHint::isConstantAddr(const Value *V) const { + if (auto PT = dyn_cast<PointerType>(V->getType())) { + unsigned As = PT->getAddressSpace(); + return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT; + } + return false; +} + +bool AMDGPUPerfHint::MemAccessInfo::isLargeStride( + MemAccessInfo &Reference) const { + + if (!Base || !Reference.Base || Base != Reference.Base) + return false; + + uint64_t Diff = Offset > Reference.Offset ? 
Offset - Reference.Offset + : Reference.Offset - Offset; + bool Result = Diff > LargeStrideThresh; + LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n" + << print() << "<=>\n" + << Reference.print() << "Result:" << Result << '\n'); + return Result; +} +} // namespace + +bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) { + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + const TargetMachine &TM = TPC->getTM<TargetMachine>(); + const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F); + + AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering()); + Analyzer.runOnFunction(F); + return false; +} + +bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const { + auto FI = FIM.find(F); + if (FI == FIM.end()) + return false; + + return AMDGPUPerfHint::isMemBound(FI->second); +} + +bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const { + auto FI = FIM.find(F); + if (FI == FIM.end()) + return false; + + return AMDGPUPerfHint::needLimitWave(FI->second); +} diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h new file mode 100644 index 000000000000..be7f37cb6815 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -0,0 +1,55 @@ +//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Analyzes if a function potentially memory bound and if a kernel +/// kernel may benefit from limiting number of waves to reduce cache thrashing. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H +#define LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H +#include "llvm/IR/ValueMap.h" +#include "llvm/Pass.h" + +namespace llvm { + +struct AMDGPUPerfHintAnalysis : public FunctionPass { + static char ID; + +public: + AMDGPUPerfHintAnalysis() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + bool isMemoryBound(const Function *F) const; + + bool needsWaveLimiter(const Function *F) const; + + struct FuncInfo { + unsigned MemInstCount; + unsigned InstCount; + unsigned IAMInstCount; // Indirect access memory instruction count + unsigned LSMInstCount; // Large stride memory instruction count + FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0), + LSMInstCount(0) {} + }; + + typedef ValueMap<const Function*, FuncInfo> FuncInfoMap; + +private: + + FuncInfoMap FIM; +}; +} // namespace llvm +#endif // LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 41876ed45c8c..d341fec6296f 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -65,6 +65,11 @@ using namespace llvm; namespace { +static cl::opt<bool> DisablePromoteAllocaToVector( + "disable-promote-alloca-to-vector", + cl::desc("Disable promote alloca to vector"), + cl::init(false)); + // FIXME: This can create globals so should be a module pass. 
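Stepping back from the raw hunks: the new AMDGPUPerfHintAnalysis above flags a function as memory bound when memory instructions exceed a percentage of all instructions, and flags a kernel for wave limiting when a weighted count (indirect and large-stride accesses weighted heavily) crosses a second threshold. The standalone sketch below mirrors that arithmetic with plain integers, using the default cl::opt values visible in the patch (both thresholds 50%, both weights 1000); the struct and function names here are invented for illustration and are not the patch's API.

#include <cstdio>

// Rough stand-in for AMDGPUPerfHintAnalysis::FuncInfo; field names are ours.
struct FuncCounts {
  unsigned MemInst;     // memory instructions
  unsigned Inst;        // all instructions
  unsigned IndirectMem; // indirect-access memory instructions
  unsigned LargeStride; // large-stride memory instructions
};

// Defaults of the cl::opt knobs introduced by the patch.
constexpr unsigned MemBoundThresh = 50;  // percent
constexpr unsigned LimitWaveThresh = 50; // percent
constexpr unsigned IAWeight = 1000;
constexpr unsigned LSWeight = 1000;

static bool isMemBound(const FuncCounts &F) {
  return F.MemInst * 100 / F.Inst > MemBoundThresh;
}

static bool needLimitWave(const FuncCounts &F) {
  return (F.MemInst + F.IndirectMem * IAWeight + F.LargeStride * LSWeight) *
             100 / F.Inst > LimitWaveThresh;
}

int main() {
  // 30 of 100 instructions touch memory, one of them indirectly: the 30%
  // ratio is below the 50% memory-bound threshold, but the indirect access
  // is weighted as 1000, so the kernel still gets the wave limiter.
  FuncCounts F{30, 100, 1, 0};
  std::printf("memBound=%d limitWave=%d\n", isMemBound(F), needLimitWave(F));
  return 0;
}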
class AMDGPUPromoteAlloca : public FunctionPass { private: @@ -147,7 +152,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { IsAMDGCN = TT.getArch() == Triple::amdgcn; IsAMDHSA = TT.getOS() == Triple::AMDHSA; - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); if (!ST.isPromoteAllocaEnabled()) return false; @@ -169,8 +174,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { std::pair<Value *, Value *> AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>( - *Builder.GetInsertBlock()->getParent()); + const Function &F = *Builder.GetInsertBlock()->getParent(); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); if (!IsAMDHSA) { Function *LocalSizeYFn @@ -256,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { } Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>( - *Builder.GetInsertBlock()->getParent()); + const AMDGPUSubtarget &ST = + AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent()); Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic; switch (N) { @@ -318,18 +323,19 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { static bool canVectorizeInst(Instruction *Inst, User *User) { switch (Inst->getOpcode()) { case Instruction::Load: { + // Currently only handle the case where the Pointer Operand is a GEP. + // Also we could not vectorize volatile or atomic loads. LoadInst *LI = cast<LoadInst>(Inst); - // Currently only handle the case where the Pointer Operand is a GEP so check for that case. - return isa<GetElementPtrInst>(LI->getPointerOperand()) && !LI->isVolatile(); + return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple(); } case Instruction::BitCast: - case Instruction::AddrSpaceCast: return true; case Instruction::Store: { // Must be the stored pointer operand, not a stored value, plus // since it should be canonical form, the User should be a GEP. + // Also we could not vectorize volatile or atomic stores. StoreInst *SI = cast<StoreInst>(Inst); - return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && !SI->isVolatile(); + return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple(); } default: return false; @@ -337,19 +343,25 @@ static bool canVectorizeInst(Instruction *Inst, User *User) { } static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { + + if (DisablePromoteAllocaToVector) { + LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n"); + return false; + } + ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType()); - DEBUG(dbgs() << "Alloca candidate for vectorization\n"); + LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n"); // FIXME: There is no reason why we can't support larger arrays, we // are just being conservative for now. // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. 
Potentially these // could also be promoted but we don't currently handle this case if (!AllocaTy || - AllocaTy->getNumElements() > 4 || + AllocaTy->getNumElements() > 16 || AllocaTy->getNumElements() < 2 || !VectorType::isValidElementType(AllocaTy->getElementType())) { - DEBUG(dbgs() << " Cannot convert type to vector\n"); + LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } @@ -370,7 +382,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. if (!Index) { - DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); + LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP + << '\n'); return false; } @@ -385,8 +398,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { VectorType *VectorTy = arrayTypeToVecType(AllocaTy); - DEBUG(dbgs() << " Converting alloca to vector " - << *AllocaTy << " -> " << *VectorTy << '\n'); + LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " + << *VectorTy << '\n'); for (Value *V : WorkList) { Instruction *Inst = cast<Instruction>(V); @@ -443,7 +456,8 @@ static bool isCallPromotable(CallInst *CI) { case Intrinsic::lifetime_end: case Intrinsic::invariant_start: case Intrinsic::invariant_end: - case Intrinsic::invariant_group_barrier: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: case Intrinsic::objectsize: return true; default: @@ -475,7 +489,8 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca, // important part is both must have the same address space at // the end. if (OtherObj != BaseAlloca) { - DEBUG(dbgs() << "Found a binary instruction with another alloca object\n"); + LLVM_DEBUG( + dbgs() << "Found a binary instruction with another alloca object\n"); return false; } @@ -588,7 +603,7 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { FunctionType *FTy = F.getFunctionType(); - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); // If the function has any arguments in the local address space, then it's // possible these arguments require the entire local memory space, so @@ -597,8 +612,8 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { LocalMemLimit = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " - "local memory disabled.\n"); + LLVM_DEBUG(dbgs() << "Function has local memory argument. 
Promoting to " + "local memory disabled.\n"); return false; } } @@ -667,13 +682,12 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { LocalMemLimit = MaxSizeWithWaveCount; - DEBUG( - dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" - << " Rounding size to " << MaxSizeWithWaveCount - << " with a maximum occupancy of " << MaxOccupancy << '\n' - << " and " << (LocalMemLimit - CurrentLocalMemUsage) - << " available for promotion\n" - ); + LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage + << " bytes of LDS\n" + << " Rounding size to " << MaxSizeWithWaveCount + << " with a maximum occupancy of " << MaxOccupancy << '\n' + << " and " << (LocalMemLimit - CurrentLocalMemUsage) + << " available for promotion\n"); return true; } @@ -690,7 +704,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { // First try to replace the alloca with a vector Type *AllocaTy = I.getAllocatedType(); - DEBUG(dbgs() << "Trying to promote " << I << '\n'); + LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); if (tryPromoteAllocaToVector(&I, AS)) return true; // Promoted to vector. @@ -706,7 +720,9 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { case CallingConv::SPIR_KERNEL: break; default: - DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n"); + LLVM_DEBUG( + dbgs() + << " promote alloca to LDS not supported with calling convention.\n"); return false; } @@ -714,8 +730,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { if (!SufficientLDS) return false; - const AMDGPUSubtarget &ST = - TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; const DataLayout &DL = Mod->getDataLayout(); @@ -735,8 +750,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { NewSize += AllocSize; if (NewSize > LocalMemLimit) { - DEBUG(dbgs() << " " << AllocSize - << " bytes of local memory not available to promote\n"); + LLVM_DEBUG(dbgs() << " " << AllocSize + << " bytes of local memory not available to promote\n"); return false; } @@ -745,11 +760,11 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { std::vector<Value*> WorkList; if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { - DEBUG(dbgs() << " Do not know how to convert all uses\n"); + LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); return false; } - DEBUG(dbgs() << "Promoting alloca to local memory\n"); + LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n"); Function *F = I.getParent()->getParent(); @@ -843,31 +858,32 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { continue; case Intrinsic::memcpy: { MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); - Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), - MemCpy->getLength(), MemCpy->getAlignment(), - MemCpy->isVolatile()); + Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(), + MemCpy->getRawSource(), MemCpy->getSourceAlignment(), + MemCpy->getLength(), MemCpy->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memmove: { MemMoveInst *MemMove = cast<MemMoveInst>(Intr); - Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(), - MemMove->getLength(), MemMove->getAlignment(), - MemMove->isVolatile()); + 
Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(), + MemMove->getRawSource(), MemMove->getSourceAlignment(), + MemMove->getLength(), MemMove->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), - MemSet->getLength(), MemSet->getAlignment(), + MemSet->getLength(), MemSet->getDestAlignment(), MemSet->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::invariant_start: case Intrinsic::invariant_end: - case Intrinsic::invariant_group_barrier: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: Intr->eraseFromParent(); // FIXME: I think the invariant marker should still theoretically apply, // but the intrinsics need to be changed to accept pointers with any diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 1ed02fae085a..012e4fe200aa 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -14,7 +14,9 @@ #include "AMDGPURegisterBankInfo.h" #include "AMDGPUInstrInfo.h" +#include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -50,10 +52,38 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) } -unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A, - const RegisterBank &B, - unsigned Size) const { - return RegisterBankInfo::copyCost(A, B, Size); +static bool isConstant(const MachineOperand &MO, int64_t &C) { + const MachineFunction *MF = MO.getParent()->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const MachineInstr *Def = MRI.getVRegDef(MO.getReg()); + if (!Def) + return false; + + if (Def->getOpcode() == AMDGPU::G_CONSTANT) { + C = Def->getOperand(1).getCImm()->getSExtValue(); + return true; + } + + if (Def->getOpcode() == AMDGPU::COPY) + return isConstant(Def->getOperand(1), C); + + return false; +} + +unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, + const RegisterBank &Src, + unsigned Size) const { + if (Dst.getID() == AMDGPU::SGPRRegBankID && + Src.getID() == AMDGPU::VGPRRegBankID) + return std::numeric_limits<unsigned>::max(); + + // SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by + // the valu. + if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID && + Src.getID() == AMDGPU::SGPRRegBankID) + return std::numeric_limits<unsigned>::max(); + + return RegisterBankInfo::copyCost(Dst, Src, Size); } const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( @@ -72,11 +102,11 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); InstructionMappings AltMappings; switch (MI.getOpcode()) { case TargetOpcode::G_LOAD: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); // FIXME: Should we be hard coding the size for these mappings? 
const InstructionMapping &SSMapping = getInstructionMapping( 1, 1, getOperandsMapping( @@ -104,6 +134,42 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( return AltMappings; } + case TargetOpcode::G_ICMP: { + unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); + const InstructionMapping &SSMapping = getInstructionMapping(1, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), + nullptr, // Predicate operand. + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), + 4); // Num Operands + AltMappings.push_back(&SSMapping); + + const InstructionMapping &SVMapping = getInstructionMapping(2, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), + nullptr, // Predicate operand. + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), + 4); // Num Operands + AltMappings.push_back(&SVMapping); + + const InstructionMapping &VSMapping = getInstructionMapping(3, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), + nullptr, // Predicate operand. + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), + 4); // Num Operands + AltMappings.push_back(&VSMapping); + + const InstructionMapping &VVMapping = getInstructionMapping(4, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), + nullptr, // Predicate operand. + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), + 4); // Num Operands + AltMappings.push_back(&VVMapping); + + return AltMappings; + } default: break; } @@ -120,7 +186,60 @@ static bool isInstrUniform(const MachineInstr &MI) { return false; const MachineMemOperand *MMO = *MI.memoperands_begin(); - return AMDGPU::isUniformMMO(MMO); + return AMDGPUInstrInfo::isUniformMMO(MMO); +} + +bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { + unsigned Reg = MI.getOperand(i).getReg(); + const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); + if (Bank && Bank->getID() != AMDGPU::SGPRRegBankID) + return false; + } + return true; +} + +const RegisterBankInfo::InstructionMapping & +AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); + OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + } + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), + MI.getNumOperands()); +} + +const RegisterBankInfo::InstructionMapping & +AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + unsigned OpdIdx = 0; + + unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); + + if (MI.getOperand(OpdIdx).isIntrinsicID()) + 
OpdsMapping[OpdIdx++] = nullptr; + + unsigned Reg1 = MI.getOperand(OpdIdx).getReg(); + unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI); + unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI); + OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1); + + for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) { + unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI); + OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + } + + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), + MI.getNumOperands()); } const RegisterBankInfo::InstructionMapping & @@ -155,6 +274,22 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { // handle that during instruction selection? } +unsigned +AMDGPURegisterBankInfo::getRegBankID(unsigned Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Default) const { + + const RegisterBank *Bank = getRegBank(Reg, MRI, TRI); + return Bank ? Bank->getID() : Default; +} + +/// +/// This function must return a legal mapping, because +/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called +/// in RegBankSelect::Mode::Fast. Any mapping that would cause a +/// VGPR to SGPR generated is illegal. +/// const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); @@ -166,16 +301,102 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); - bool IsComplete = true; switch (MI.getOpcode()) { default: - IsComplete = false; + return getInvalidInstructionMapping(); + case AMDGPU::G_ADD: + case AMDGPU::G_SUB: + case AMDGPU::G_MUL: + case AMDGPU::G_AND: + case AMDGPU::G_OR: + case AMDGPU::G_XOR: + case AMDGPU::G_SHL: + if (isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + // Fall-through + + case AMDGPU::G_FADD: + case AMDGPU::G_FPTOSI: + case AMDGPU::G_FPTOUI: + case AMDGPU::G_FMUL: + return getDefaultMappingVOP(MI); + case AMDGPU::G_IMPLICIT_DEF: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; + } + case AMDGPU::G_FCONSTANT: case AMDGPU::G_CONSTANT: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } + case AMDGPU::G_EXTRACT: { + unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); + OpdsMapping[2] = nullptr; + break; + } + case AMDGPU::G_MERGE_VALUES: { + unsigned Bank = isSALUMapping(MI) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + + OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); + // Op1 and Dst should use the same register bank. 
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) + OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); + break; + } + case AMDGPU::G_BITCAST: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); + break; + } + case AMDGPU::G_TRUNC: { + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + unsigned Bank = getRegBankID(Src, MRI, *TRI); + unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); + unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); + break; + } + case AMDGPU::G_ZEXT: { + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); + unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); + unsigned SrcBank = getRegBankID(Src, MRI, *TRI, + SrcSize == 1 ? AMDGPU::SGPRRegBankID : + AMDGPU::VGPRRegBankID); + unsigned DstBank = SrcBank; + if (SrcSize == 1) { + if (SrcBank == AMDGPU::SGPRRegBankID) + DstBank = AMDGPU::VGPRRegBankID; + else + DstBank = AMDGPU::SGPRRegBankID; + } + + OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank, SrcSize); + break; + } + case AMDGPU::G_FCMP: { + unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 1); + OpdsMapping[1] = nullptr; // Predicate Operand. + OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + break; + } case AMDGPU::G_GEP: { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (!MI.getOperand(i).isReg()) @@ -204,24 +425,113 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } - case AMDGPU::G_LOAD: - return getInstrMappingForLoad(MI); + case AMDGPU::G_ICMP: { + unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID && + Op3Bank == AMDGPU::SGPRRegBankID ? + AMDGPU::SCCRegBankID : AMDGPU::VGPRRegBankID; + OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1); + OpdsMapping[1] = nullptr; // Predicate Operand. + OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size); + break; + } + + + case AMDGPU::G_EXTRACT_VECTOR_ELT: { + unsigned IdxOp = 2; + int64_t Imm; + // XXX - Do we really need to fully handle these? The constant case should + // be legalized away before RegBankSelect? + + unsigned OutputBankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + + unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits()); + OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(1).getReg()).getSizeInBits()); + + // The index can be either if the source vector is VGPR. 
+ OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits()); + break; } + case AMDGPU::G_INSERT_VECTOR_ELT: { + // XXX - Do we really need to fully handle these? The constant case should + // be legalized away before RegBankSelect? + + int64_t Imm; + + unsigned IdxOp = MI.getOpcode() == AMDGPU::G_EXTRACT_VECTOR_ELT ? 2 : 3; + unsigned BankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + + + + // TODO: Can do SGPR indexing, which would obviate the need for the + // isConstant check. + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); + OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); + } - if (!IsComplete) { - unsigned BankID = AMDGPU::SGPRRegBankID; - unsigned Size = 0; - for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { - // If the operand is not a register default to the size of the previous - // operand. - // FIXME: Can't we pull the types from the MachineInstr rather than the - // operands. - if (MI.getOperand(Idx).isReg()) - Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI); - OpdsMapping.push_back(AMDGPU::getValueMapping(BankID, Size)); + break; + } + case AMDGPU::G_INTRINSIC: { + switch (MI.getOperand(1).getIntrinsicID()) { + default: + return getInvalidInstructionMapping(); + case Intrinsic::maxnum: + case Intrinsic::minnum: + case Intrinsic::amdgcn_cvt_pkrtz: + return getDefaultMappingVOP(MI); + case Intrinsic::amdgcn_kernarg_segment_ptr: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + } + break; + } + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { + switch (MI.getOperand(0).getIntrinsicID()) { + default: + return getInvalidInstructionMapping(); + case Intrinsic::amdgcn_exp_compr: + OpdsMapping[0] = nullptr; // IntrinsicID + // FIXME: These are immediate values which can't be read from registers. + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + // FIXME: Could we support packed types here? + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + // FIXME: These are immediate values which can't be read from registers. + OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + break; + case Intrinsic::amdgcn_exp: + OpdsMapping[0] = nullptr; // IntrinsicID + // FIXME: These are immediate values which can't be read from registers. + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + // FIXME: Could we support packed types here? + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + // FIXME: These are immediate values which can't be read from registers. 
+ OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + break; } + break; + } + case AMDGPU::G_LOAD: + return getInstrMappingForLoad(MI); } + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); } diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 201fdc1974c6..d48a66589873 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -16,19 +16,15 @@ #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#define GET_REGBANK_DECLARATIONS +#include "AMDGPUGenRegisterBank.inc" +#undef GET_REGBANK_DECLARATIONS + namespace llvm { class SIRegisterInfo; class TargetRegisterInfo; -namespace AMDGPU { -enum { - SGPRRegBankID = 0, - VGPRRegBankID = 1, - NumRegisterBanks -}; -} // End AMDGPU namespace. - /// This class provides the information for the target register banks. class AMDGPUGenRegisterBankInfo : public RegisterBankInfo { @@ -46,6 +42,13 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const RegisterBankInfo::InstructionMapping & getInstrMappingForLoad(const MachineInstr &MI) const; + unsigned getRegBankID(unsigned Reg, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Default = AMDGPU::VGPRRegBankID) const; + + bool isSALUMapping(const MachineInstr &MI) const; + const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const; + const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const; public: AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI); diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td index f4428e56035f..7f7f75f65647 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -14,3 +14,5 @@ def SGPRRegBank : RegisterBank<"SGPR", def VGPRRegBank : RegisterBank<"VGPR", [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512] >; + +def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS ]>; diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 5e4d33aaa691..50f859addc2b 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -8,13 +8,15 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. +/// Parent TargetRegisterInfo class common to all hw codegen targets. // //===----------------------------------------------------------------------===// #include "AMDGPURegisterInfo.h" #include "AMDGPUTargetMachine.h" +#include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" using namespace llvm; @@ -25,7 +27,7 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} // they are not supported at this time. 
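The G_ICMP mapping earlier in this hunk keeps the 1-bit result on the new scalar condition bank (SCCRegBank, added in AMDGPURegisterBanks.td above) only when both compare inputs are uniform; any divergent input forces the result into a VGPR, and G_FCMP always produces a VGPR result. A standalone sketch of that selection rule, written here purely for illustration and not using the LLVM API:

// Standalone illustration of the G_ICMP bank choice (not LLVM API): the
// scalar condition bank is only usable when both compare inputs are uniform;
// a single divergent input forces the result into a vector register.
enum class Bank { SGPR, VGPR, SCC };

static Bank icmpResultBank(Bank Lhs, Bank Rhs) {
  return (Lhs == Bank::SGPR && Rhs == Bank::SGPR) ? Bank::SCC : Bank::VGPR;
}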
//===----------------------------------------------------------------------===// -unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { +unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) { static const unsigned SubRegs[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, @@ -37,6 +39,13 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { return SubRegs[Channel]; } +void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { + MCRegAliasIterator R(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); +} + #define GET_REGINFO_TARGET_DESC #include "AMDGPUGenRegisterInfo.inc" @@ -75,5 +84,6 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, } unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + return FuncInfo->getFrameOffsetReg(); } diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index d8604d2590f1..07de5fc549e2 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief TargetRegisterInfo interface that is implemented by all hw codegen +/// TargetRegisterInfo interface that is implemented by all hw codegen /// targets. // //===----------------------------------------------------------------------===// @@ -21,15 +21,19 @@ namespace llvm { -class AMDGPUSubtarget; +class GCNSubtarget; class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { AMDGPURegisterInfo(); + bool enableMultipleCopyHints() const override { return true; } + /// \returns the sub reg enum value for the given \p Channel /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) - unsigned getSubRegFromChannel(unsigned Channel) const; + static unsigned getSubRegFromChannel(unsigned Channel); + + void reserveRegisterTuples(BitVector &, unsigned Reg) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td index 3bbcba826f63..ceabae524414 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -19,5 +19,4 @@ foreach Index = 0-15 in { } -include "R600RegisterInfo.td" include "SIRegisterInfo.td" diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 83e56a9ab495..a861762a8c9e 100644 --- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -249,8 +249,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { SmallVector<Argument *, 4> OutArgs; for (Argument &Arg : F.args()) { if (isOutArgumentCandidate(Arg)) { - DEBUG(dbgs() << "Found possible out argument " << Arg - << " in function " << F.getName() << '\n'); + LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg + << " in function " << F.getName() << '\n'); OutArgs.push_back(&Arg); } } @@ -310,7 +310,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { SI = dyn_cast<StoreInst>(Q.getInst()); if (SI) { - DEBUG(dbgs() << "Found out argument store: " << *SI << '\n'); + LLVM_DEBUG(dbgs() << "Found out argument store: " << *SI << '\n'); ReplaceableStores.emplace_back(RI, SI); } else { ThisReplaceable = false; @@ -328,7 +328,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (llvm::find_if(ValVec, [OutArg](const std::pair<Argument *, Value *> &Entry) { return Entry.first == OutArg;}) != ValVec.end()) { - DEBUG(dbgs() << "Saw multiple out arg stores" << *OutArg << '\n'); + LLVM_DEBUG(dbgs() + << "Saw multiple out arg stores" << *OutArg << '\n'); // It is possible to see stores to the same argument multiple times, // but we expect these would have been optimized out already. ThisReplaceable = false; @@ -358,7 +359,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { F.getFunctionType()->params(), F.isVarArg()); - DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n'); + LLVM_DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n'); Function *NewFunc = Function::Create(NewFuncTy, Function::PrivateLinkage, F.getName() + ".body"); diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td new file mode 100644 index 000000000000..9dbd7751b4d8 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -0,0 +1,77 @@ +//===-- AMDGPUSearchableTables.td - ------------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Resource intrinsics table. 
+//===----------------------------------------------------------------------===// + +class RsrcIntrinsic<AMDGPURsrcIntrinsic intr> { + Intrinsic Intr = !cast<Intrinsic>(intr); + bits<8> RsrcArg = intr.RsrcArg; + bit IsImage = intr.IsImage; +} + +def RsrcIntrinsics : GenericTable { + let FilterClass = "RsrcIntrinsic"; + let Fields = ["Intr", "RsrcArg", "IsImage"]; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "lookupRsrcIntrinsic"; +} + +foreach intr = !listconcat(AMDGPUBufferIntrinsics, + AMDGPUImageDimIntrinsics, + AMDGPUImageDimAtomicIntrinsics) in { + def : RsrcIntrinsic<!cast<AMDGPURsrcIntrinsic>(intr)>; +} + +class SourceOfDivergence<Intrinsic intr> { + Intrinsic Intr = intr; +} + +def SourcesOfDivergence : GenericTable { + let FilterClass = "SourceOfDivergence"; + let Fields = ["Intr"]; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "lookupSourceOfDivergence"; +} + +def : SourceOfDivergence<int_amdgcn_workitem_id_x>; +def : SourceOfDivergence<int_amdgcn_workitem_id_y>; +def : SourceOfDivergence<int_amdgcn_workitem_id_z>; +def : SourceOfDivergence<int_amdgcn_interp_mov>; +def : SourceOfDivergence<int_amdgcn_interp_p1>; +def : SourceOfDivergence<int_amdgcn_interp_p2>; +def : SourceOfDivergence<int_amdgcn_mbcnt_hi>; +def : SourceOfDivergence<int_amdgcn_mbcnt_lo>; +def : SourceOfDivergence<int_r600_read_tidig_x>; +def : SourceOfDivergence<int_r600_read_tidig_y>; +def : SourceOfDivergence<int_r600_read_tidig_z>; +def : SourceOfDivergence<int_amdgcn_atomic_inc>; +def : SourceOfDivergence<int_amdgcn_atomic_dec>; +def : SourceOfDivergence<int_amdgcn_ds_fadd>; +def : SourceOfDivergence<int_amdgcn_ds_fmin>; +def : SourceOfDivergence<int_amdgcn_ds_fmax>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_ps_live>; +def : SourceOfDivergence<int_amdgcn_ds_swizzle>; + +foreach intr = AMDGPUImageDimAtomicIntrinsics in +def : SourceOfDivergence<intr>; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 80feaa44766f..98b49070fa99 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. +/// Implements the AMDGPU specific subclass of TargetSubtarget. 
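The two GenericTable definitions above compile into constant lookup tables, and the PrimaryKeyName fields name the generated accessors (lookupRsrcIntrinsic, lookupSourceOfDivergence). A sketch of how such tables are typically consulted; the generated header name, the AMDGPU namespace, and the exact entry types are assumptions inferred from the definitions above, not something this diff shows:

// Hedged sketch: accessor names come from PrimaryKeyName above; the include
// name and namespace are assumed, not confirmed by this patch.
#include "AMDGPUGenSearchableTables.inc"   // assumed generated header name

static bool isDivergentIntrinsic(unsigned IntrID) {
  // Non-null only for intrinsics registered as SourceOfDivergence above.
  return AMDGPU::lookupSourceOfDivergence(IntrID) != nullptr;
}

static int rsrcOperandIndex(unsigned IntrID) {
  // Entry fields mirror the RsrcIntrinsic class: Intr, RsrcArg, IsImage.
  if (const AMDGPU::RsrcIntrinsic *Entry = AMDGPU::lookupRsrcIntrinsic(IntrID))
    return Entry->RsrcArg;   // which operand carries the resource descriptor
  return -1;                 // not a buffer/image resource intrinsic
}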
// //===----------------------------------------------------------------------===// @@ -20,8 +20,10 @@ #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/IR/MDBuilder.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include <algorithm> @@ -32,12 +34,37 @@ using namespace llvm; #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR +#define AMDGPUSubtarget GCNSubtarget #include "AMDGPUGenSubtargetInfo.inc" +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#undef AMDGPUSubtarget +#include "R600GenSubtargetInfo.inc" -AMDGPUSubtarget::~AMDGPUSubtarget() = default; +GCNSubtarget::~GCNSubtarget() = default; + +R600Subtarget & +R600Subtarget::initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS) { + SmallString<256> FullFS("+promote-alloca,+dx10-clamp,"); + FullFS += FS; + ParseSubtargetFeatures(GPU, FullFS); + + // FIXME: I don't think think Evergreen has any useful support for + // denormals, but should be checked. Should we issue a warning somewhere + // if someone tries to enable these? + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP32Denormals = false; + } + + HasMulU24 = getGeneration() >= EVERGREEN; + HasMulI24 = hasCaymanISA(); + + return *this; +} -AMDGPUSubtarget & -AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, +GCNSubtarget & +GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS) { // Determine default and user-specified characteristics // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be @@ -92,26 +119,43 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, HasMovrel = true; } + HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + return *this; } -AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const TargetMachine &TM) - : AMDGPUGenSubtargetInfo(TT, GPU, FS), +AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, + const FeatureBitset &FeatureBits) : + TargetTriple(TT), + SubtargetFeatureBits(FeatureBits), + Has16BitInsts(false), + HasMadMixInsts(false), + FP32Denormals(false), + FPExceptions(false), + HasSDWA(false), + HasVOP3PInsts(false), + HasMulI24(true), + HasMulU24(true), + HasFminFmaxLegacy(true), + EnablePromoteAlloca(false), + LocalMemorySize(0), + WavefrontSize(0) + { } + +GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + const GCNTargetMachine &TM) : + AMDGPUGenSubtargetInfo(TT, GPU, FS), + AMDGPUSubtarget(TT, getFeatureBits()), TargetTriple(TT), - Gen(TT.getArch() == Triple::amdgcn ? 
SOUTHERN_ISLANDS : R600), + Gen(SOUTHERN_ISLANDS), IsaVersion(ISAVersion0_0_0), - WavefrontSize(0), - LocalMemorySize(0), LDSBankCount(0), MaxPrivateElementSize(0), FastFMAF32(false), HalfRate64Ops(false), - FP32Denormals(false), FP64FP16Denormals(false), - FPExceptions(false), DX10Clamp(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), @@ -123,57 +167,56 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, EnableXNACK(false), TrapHandler(false), DebuggerInsertNops(false), - DebuggerReserveRegs(false), DebuggerEmitPrologue(false), EnableHugePrivateBuffer(false), EnableVGPRSpilling(false), - EnablePromoteAlloca(false), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), EnableSIScheduler(false), + EnableDS128(false), DumpCode(false), FP64(false), - FMA(false), - IsGCN(false), GCN3Encoding(false), CIInsts(false), GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), - Has16BitInsts(false), HasIntClamp(false), - HasVOP3PInsts(false), - HasMadMixInsts(false), + HasFmaMixInsts(false), HasMovrel(false), HasVGPRIndexMode(false), HasScalarStores(false), + HasScalarAtomics(false), HasInv2PiInlineImm(false), - HasSDWA(false), HasSDWAOmod(false), HasSDWAScalar(false), HasSDWASdst(false), HasSDWAMac(false), HasSDWAOutModsVOPC(false), HasDPP(false), + HasDLInsts(false), + D16PreservesUnusedBits(false), FlatAddressSpace(false), FlatInstOffsets(false), FlatGlobalInsts(false), FlatScratchInsts(false), AddNoCarryInsts(false), + HasUnpackedD16VMem(false), - R600ALUInst(false), - CaymanISA(false), - CFALUBug(false), - HasVertexCache(false), - TexVTXClauseSize(0), ScalarizeGlobal(false), FeatureDisable(false), - InstrItins(getInstrItineraryForCPU(GPU)) { + InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), + TLInfo(TM, *this), + FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { AS = AMDGPU::getAMDGPUAS(TT); - initializeSubtargetDependencies(TT, GPU, FS); + CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); + Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); + RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); + InstSelector.reset(new AMDGPUInstructionSelector( + *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); } unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, @@ -198,6 +241,12 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, return NumWaves; } +unsigned +AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const { + const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); + return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); +} + std::pair<unsigned, unsigned> AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { switch (CC) { @@ -357,27 +406,64 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { return true; } -R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, - const TargetMachine &TM) : - AMDGPUSubtarget(TT, GPU, FS, TM), - InstrInfo(*this), - FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), - TLInfo(TM, *this) {} +uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, + unsigned &MaxAlign) const { + assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || + F.getCallingConv() == CallingConv::SPIR_KERNEL); -SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const TargetMachine &TM) - : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this), - 
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), - TLInfo(TM, *this) { - CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); - Legalizer.reset(new AMDGPULegalizerInfo()); + const DataLayout &DL = F.getParent()->getDataLayout(); + uint64_t ExplicitArgBytes = 0; + MaxAlign = 1; - RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); - InstSelector.reset(new AMDGPUInstructionSelector( - *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()))); + for (const Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + + unsigned Align = DL.getABITypeAlignment(ArgTy); + uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); + ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; + MaxAlign = std::max(MaxAlign, Align); + } + + return ExplicitArgBytes; } -void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, +unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, + unsigned &MaxAlign) const { + uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); + + unsigned ExplicitOffset = getExplicitKernelArgOffset(F); + + uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; + unsigned ImplicitBytes = getImplicitArgNumBytes(F); + if (ImplicitBytes != 0) { + unsigned Alignment = getAlignmentForImplicitArgPtr(); + TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; + } + + // Being able to dereference past the end is useful for emitting scalar loads. + return alignTo(TotalSize, 4); +} + +R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, + const TargetMachine &TM) : + R600GenSubtargetInfo(TT, GPU, FS), + AMDGPUSubtarget(TT, getFeatureBits()), + InstrInfo(*this), + FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), + FMA(false), + CaymanISA(false), + CFALUBug(false), + DX10Clamp(false), + HasVertexCache(false), + R600ALUInst(false), + FP64(false), + TexVTXClauseSize(0), + Gen(R600), + TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)), + InstrItins(getInstrItineraryForCPU(GPU)), + AS (AMDGPU::getAMDGPUAS(TT)) { } + +void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const { // Track register pressure so the scheduler can try to decrease // pressure once register usage is above the threshold defined by @@ -394,22 +480,12 @@ void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Policy.ShouldTrackLaneMasks = true; } -bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { +bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const { return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); } -unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, - unsigned ExplicitArgBytes) const { - unsigned ImplicitBytes = getImplicitArgNumBytes(MF); - if (ImplicitBytes == 0) - return ExplicitArgBytes; - - unsigned Alignment = getAlignmentForImplicitArgPtr(); - return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; -} - -unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { - if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { +unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) return 10; if (SGPRs <= 88) @@ -431,7 +507,7 @@ unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { return 5; } -unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { +unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned 
VGPRs) const { if (VGPRs <= 24) return 10; if (VGPRs <= 28) @@ -453,7 +529,7 @@ unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { return 1; } -unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { +unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); if (MFI.hasFlatScratchInit()) { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) @@ -467,7 +543,7 @@ unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { return 2; // VCC. } -unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { +unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -517,7 +593,7 @@ unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { MaxAddressableNumSGPRs); } -unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { +unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -532,10 +608,6 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { unsigned Requested = AMDGPU::getIntegerAttribute( F, "amdgpu-num-vgpr", MaxNumVGPRs); - // Make sure requested value does not violate subtarget's specifications. - if (Requested && Requested <= getReservedNumVGPRs(MF)) - Requested = 0; - // Make sure requested value is compatible with values implied by // default/requested minimum/maximum number of waves per execution unit. if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) @@ -548,7 +620,7 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { MaxNumVGPRs = Requested; } - return MaxNumVGPRs - getReservedNumVGPRs(MF); + return MaxNumVGPRs; } namespace { @@ -602,7 +674,21 @@ struct MemOpClusterMutation : ScheduleDAGMutation { }; } // namespace -void SISubtarget::getPostRAMutations( +void GCNSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); } + +const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { + if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) + return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>()); + else + return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>()); +} + +const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) { + if (TM.getTargetTriple().getArch() == Triple::amdgcn) + return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F)); + else + return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F)); +} diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index cf4a691d4b58..623109733651 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -8,7 +8,7 @@ //==-----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU specific subclass of TargetSubtarget. +/// AMDGPU specific subclass of TargetSubtarget. 
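getExplicitKernArgSize and getKernArgSegmentSize above implement a simple packing rule: each explicit argument is placed at the next offset aligned to its ABI alignment, implicit arguments (if any) are appended after realigning, and the total is rounded up to 4 bytes so scalar loads may dereference slightly past the end. A standalone arithmetic sketch of the same rule, illustrative only and deliberately omitting the explicit-offset detail handled by the real code:

// Standalone sketch of the kernarg packing rule (not LLVM API).
#include <cstdint>
#include <vector>

static uint64_t alignToPow2(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);   // Align must be a power of two
}

struct Arg { uint64_t Size, Align; };

static uint64_t kernArgSegmentSize(const std::vector<Arg> &Args,
                                   uint64_t ImplicitBytes,
                                   uint64_t ImplicitAlign) {
  uint64_t Bytes = 0;
  // Explicit arguments are laid out in order at their ABI alignment.
  for (const Arg &A : Args)
    Bytes = alignToPow2(Bytes, A.Align) + A.Size;
  // Implicit arguments (e.g. grid sizes) are appended at their own alignment.
  if (ImplicitBytes != 0)
    Bytes = alignToPow2(Bytes, ImplicitAlign) + ImplicitBytes;
  // Round up so scalar loads can safely read a full dword at the end.
  return alignToPow2(Bytes, 4);
}
// Example: (i32, ptr, i16) -> 4 bytes, realign to 8 and add 8 -> 16, add 2
// -> 18, rounded up to 20 bytes of explicit arguments.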
// //===----------------------------------------------------------------------===// @@ -23,7 +23,6 @@ #include "SIFrameLowering.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" @@ -40,24 +39,216 @@ #define GET_SUBTARGETINFO_HEADER #include "AMDGPUGenSubtargetInfo.inc" +#define GET_SUBTARGETINFO_HEADER +#include "R600GenSubtargetInfo.inc" namespace llvm { class StringRef; -class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { +class AMDGPUSubtarget { public: enum Generation { R600 = 0, - R700, - EVERGREEN, - NORTHERN_ISLANDS, - SOUTHERN_ISLANDS, - SEA_ISLANDS, - VOLCANIC_ISLANDS, - GFX9, + R700 = 1, + EVERGREEN = 2, + NORTHERN_ISLANDS = 3, + SOUTHERN_ISLANDS = 4, + SEA_ISLANDS = 5, + VOLCANIC_ISLANDS = 6, + GFX9 = 7 }; +private: + Triple TargetTriple; + +protected: + const FeatureBitset &SubtargetFeatureBits; + bool Has16BitInsts; + bool HasMadMixInsts; + bool FP32Denormals; + bool FPExceptions; + bool HasSDWA; + bool HasVOP3PInsts; + bool HasMulI24; + bool HasMulU24; + bool HasFminFmaxLegacy; + bool EnablePromoteAlloca; + int LocalMemorySize; + unsigned WavefrontSize; + +public: + AMDGPUSubtarget(const Triple &TT, const FeatureBitset &FeatureBits); + + static const AMDGPUSubtarget &get(const MachineFunction &MF); + static const AMDGPUSubtarget &get(const TargetMachine &TM, + const Function &F); + + /// \returns Default range flat work group size for a calling convention. + std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; + + /// \returns Subtarget's default pair of minimum/maximum flat work group sizes + /// for function \p F, or minimum/maximum flat work group sizes explicitly + /// requested using "amdgpu-flat-work-group-size" attribute attached to + /// function \p F. + /// + /// \returns Subtarget's default values if explicitly requested values cannot + /// be converted to integer, or violate subtarget's specifications. + std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; + + /// \returns Subtarget's default pair of minimum/maximum number of waves per + /// execution unit for function \p F, or minimum/maximum number of waves per + /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute + /// attached to function \p F. + /// + /// \returns Subtarget's default values if explicitly requested values cannot + /// be converted to integer, violate subtarget's specifications, or are not + /// compatible with minimum/maximum number of waves limited by flat work group + /// size, register usage, and/or lds usage. + std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; + + /// Return the amount of LDS that can be used that will not restrict the + /// occupancy lower than WaveCount. + unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, + const Function &) const; + + /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if + /// the given LDS memory size is the only constraint. 
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; + + unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; + + bool isAmdHsaOS() const { + return TargetTriple.getOS() == Triple::AMDHSA; + } + + bool isAmdPalOS() const { + return TargetTriple.getOS() == Triple::AMDPAL; + } + + bool isMesa3DOS() const { + return TargetTriple.getOS() == Triple::Mesa3D; + } + + bool isMesaKernel(const Function &F) const { + return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); + } + + bool isAmdCodeObjectV2(const Function &F) const { + return isAmdHsaOS() || isMesaKernel(F); + } + + bool has16BitInsts() const { + return Has16BitInsts; + } + + bool hasMadMixInsts() const { + return HasMadMixInsts; + } + + bool hasFP32Denormals() const { + return FP32Denormals; + } + + bool hasFPExceptions() const { + return FPExceptions; + } + + bool hasSDWA() const { + return HasSDWA; + } + + bool hasVOP3PInsts() const { + return HasVOP3PInsts; + } + + bool hasMulI24() const { + return HasMulI24; + } + + bool hasMulU24() const { + return HasMulU24; + } + + bool hasFminFmaxLegacy() const { + return HasFminFmaxLegacy; + } + + bool isPromoteAllocaEnabled() const { + return EnablePromoteAlloca; + } + + unsigned getWavefrontSize() const { + return WavefrontSize; + } + + int getLocalMemorySize() const { + return LocalMemorySize; + } + + unsigned getAlignmentForImplicitArgPtr() const { + return isAmdHsaOS() ? 8 : 4; + } + + /// Returns the offset in bytes from the start of the input buffer + /// of the first explicit kernel argument. + unsigned getExplicitKernelArgOffset(const Function &F) const { + return isAmdCodeObjectV2(F) ? 0 : 36; + } + + /// \returns Maximum number of work groups per compute unit supported by the + /// subtarget and limited by given \p FlatWorkGroupSize. + unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { + return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(SubtargetFeatureBits, + FlatWorkGroupSize); + } + + /// \returns Minimum flat work group size supported by the subtarget. + unsigned getMinFlatWorkGroupSize() const { + return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(SubtargetFeatureBits); + } + + /// \returns Maximum flat work group size supported by the subtarget. + unsigned getMaxFlatWorkGroupSize() const { + return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(SubtargetFeatureBits); + } + + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget and limited by given \p FlatWorkGroupSize. + unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { + return AMDGPU::IsaInfo::getMaxWavesPerEU(SubtargetFeatureBits, + FlatWorkGroupSize); + } + + /// \returns Minimum number of waves per execution unit supported by the + /// subtarget. + unsigned getMinWavesPerEU() const { + return AMDGPU::IsaInfo::getMinWavesPerEU(SubtargetFeatureBits); + } + + unsigned getMaxWavesPerEU() const { return 10; } + + /// Creates value range metadata on an workitemid.* inrinsic call or load. + bool makeLIDRangeMetadata(Instruction *I) const; + + /// \returns Number of bytes of arguments that are passed to a shader or + /// kernel in addition to the explicit ones declared for the function. 
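The queries above form the target-independent surface shared by the R600 and GCN subtargets, and the static AMDGPUSubtarget::get helpers defined earlier return that common base for either triple. A minimal sketch of the intended call pattern; the wrapper function name is illustrative and the usual LLVM target headers are assumed to be available:

// Sketch: target-agnostic code can stay off GCNSubtarget/R600Subtarget and
// use only the shared interface declared above.
static unsigned occupancyForLDS(const llvm::TargetMachine &TM,
                                const llvm::Function &F,
                                uint32_t LDSBytes) {
  const llvm::AMDGPUSubtarget &ST = llvm::AMDGPUSubtarget::get(TM, F);
  return ST.getOccupancyWithLocalMemSize(LDSBytes, F);
}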
+ unsigned getImplicitArgNumBytes(const Function &F) const { + if (isMesaKernel(F)) + return 16; + return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); + } + uint64_t getExplicitKernArgSize(const Function &F, + unsigned &MaxAlign) const; + unsigned getKernArgSegmentSize(const Function &F, + unsigned &MaxAlign) const; + + virtual ~AMDGPUSubtarget() {} +}; + +class GCNSubtarget : public AMDGPUGenSubtargetInfo, + public AMDGPUSubtarget { +public: enum { ISAVersion0_0_0, ISAVersion6_0_0, @@ -67,13 +258,14 @@ public: ISAVersion7_0_2, ISAVersion7_0_3, ISAVersion7_0_4, - ISAVersion8_0_0, ISAVersion8_0_1, ISAVersion8_0_2, ISAVersion8_0_3, ISAVersion8_1_0, ISAVersion9_0_0, - ISAVersion9_0_2 + ISAVersion9_0_2, + ISAVersion9_0_4, + ISAVersion9_0_6, }; enum TrapHandlerAbi { @@ -96,13 +288,18 @@ public: LLVMTrapHandlerRegValue = 1 }; +private: + /// GlobalISel related APIs. + std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + protected: // Basic subtarget description. Triple TargetTriple; - Generation Gen; + unsigned Gen; unsigned IsaVersion; - unsigned WavefrontSize; - int LocalMemorySize; int LDSBankCount; unsigned MaxPrivateElementSize; @@ -111,9 +308,7 @@ protected: bool HalfRate64Ops; // Dynamially set bits that enable features. - bool FP32Denormals; bool FP64FP16Denormals; - bool FPExceptions; bool DX10Clamp; bool FlatForGlobal; bool AutoWaitcntBeforeBarrier; @@ -124,47 +319,48 @@ protected: bool EnableXNACK; bool TrapHandler; bool DebuggerInsertNops; - bool DebuggerReserveRegs; bool DebuggerEmitPrologue; // Used as options. bool EnableHugePrivateBuffer; bool EnableVGPRSpilling; - bool EnablePromoteAlloca; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; bool EnableSIScheduler; + bool EnableDS128; bool DumpCode; // Subtarget statically properties set by tablegen bool FP64; bool FMA; + bool MIMG_R128; bool IsGCN; bool GCN3Encoding; bool CIInsts; bool GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; - bool Has16BitInsts; bool HasIntClamp; - bool HasVOP3PInsts; - bool HasMadMixInsts; + bool HasFmaMixInsts; bool HasMovrel; bool HasVGPRIndexMode; bool HasScalarStores; + bool HasScalarAtomics; bool HasInv2PiInlineImm; - bool HasSDWA; bool HasSDWAOmod; bool HasSDWAScalar; bool HasSDWASdst; bool HasSDWAMac; bool HasSDWAOutModsVOPC; bool HasDPP; + bool HasDLInsts; + bool D16PreservesUnusedBits; bool FlatAddressSpace; bool FlatInstOffsets; bool FlatGlobalInsts; bool FlatScratchInsts; bool AddNoCarryInsts; + bool HasUnpackedD16VMem; bool R600ALUInst; bool CaymanISA; bool CFALUBug; @@ -175,67 +371,68 @@ protected: // Dummy feature to use for assembler in tablegen. 
bool FeatureDisable; - InstrItineraryData InstrItins; SelectionDAGTargetInfo TSInfo; AMDGPUAS AS; +private: + SIInstrInfo InstrInfo; + SITargetLowering TLInfo; + SIFrameLowering FrameLowering; public: - AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const TargetMachine &TM); - ~AMDGPUSubtarget() override; + GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + const GCNTargetMachine &TM); + ~GCNSubtarget() override; - AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, + GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS); - const AMDGPUInstrInfo *getInstrInfo() const override = 0; - const AMDGPUFrameLowering *getFrameLowering() const override = 0; - const AMDGPUTargetLowering *getTargetLowering() const override = 0; - const AMDGPURegisterInfo *getRegisterInfo() const override = 0; + const SIInstrInfo *getInstrInfo() const override { + return &InstrInfo; + } - const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; + const SIFrameLowering *getFrameLowering() const override { + return &FrameLowering; } - // Nothing implemented, just prevent crashes on use. - const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { - return &TSInfo; + const SITargetLowering *getTargetLowering() const override { + return &TLInfo; } - void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + const SIRegisterInfo *getRegisterInfo() const override { + return &InstrInfo.getRegisterInfo(); + } - bool isAmdHsaOS() const { - return TargetTriple.getOS() == Triple::AMDHSA; + const CallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); } - bool isMesa3DOS() const { - return TargetTriple.getOS() == Triple::Mesa3D; + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); } - bool isOpenCLEnv() const { - return TargetTriple.getEnvironment() == Triple::OpenCL || - TargetTriple.getEnvironmentName() == "amdgizcl"; + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); } - bool isAmdPalOS() const { - return TargetTriple.getOS() == Triple::AMDPAL; + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); } - Generation getGeneration() const { - return Gen; + // Nothing implemented, just prevent crashes on use. 
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { + return &TSInfo; } - unsigned getWavefrontSize() const { - return WavefrontSize; + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + Generation getGeneration() const { + return (Generation)Gen; } unsigned getWavefrontSizeLog2() const { return Log2_32(WavefrontSize); } - int getLocalMemorySize() const { - return LocalMemorySize; - } - int getLDSBankCount() const { return LDSBankCount; } @@ -248,19 +445,19 @@ public: return AS; } - bool has16BitInsts() const { - return Has16BitInsts; - } - bool hasIntClamp() const { return HasIntClamp; } - bool hasVOP3PInsts() const { - return HasVOP3PInsts; + bool hasFP64() const { + return FP64; } - bool hasFP64() const { + bool hasMIMG_R128() const { + return MIMG_R128; + } + + bool hasHWFP64() const { return FP64; } @@ -273,15 +470,15 @@ public: } bool hasAddr64() const { - return (getGeneration() < VOLCANIC_ISLANDS); + return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); } bool hasBFE() const { - return (getGeneration() >= EVERGREEN); + return true; } bool hasBFI() const { - return (getGeneration() >= EVERGREEN); + return true; } bool hasBFM() const { @@ -289,62 +486,31 @@ public: } bool hasBCNT(unsigned Size) const { - if (Size == 32) - return (getGeneration() >= EVERGREEN); - - if (Size == 64) - return (getGeneration() >= SOUTHERN_ISLANDS); - - return false; - } - - bool hasMulU24() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasMulI24() const { - return (getGeneration() >= SOUTHERN_ISLANDS || - hasCaymanISA()); + return true; } bool hasFFBL() const { - return (getGeneration() >= EVERGREEN); + return true; } bool hasFFBH() const { - return (getGeneration() >= EVERGREEN); + return true; } bool hasMed3_16() const { - return getGeneration() >= GFX9; + return getGeneration() >= AMDGPUSubtarget::GFX9; } bool hasMin3Max3_16() const { - return getGeneration() >= GFX9; - } - - bool hasMadMixInsts() const { - return HasMadMixInsts; + return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasSBufferLoadStoreAtomicDwordxN() const { - // Only use the "x1" variants on GFX9 or don't use the buffer variants. - // For x2 and higher variants, if the accessed region spans 2 VM pages and - // the second page is unmapped, the hw hangs. - // TODO: There is one future GFX9 chip that doesn't have this bug. - return getGeneration() != GFX9; + bool hasFmaMixInsts() const { + return HasFmaMixInsts; } bool hasCARRY() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasBORROW() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasCaymanISA() const { - return CaymanISA; + return true; } bool hasFMA() const { @@ -359,10 +525,6 @@ public: return EnableHugePrivateBuffer; } - bool isPromoteAllocaEnabled() const { - return EnablePromoteAlloca; - } - bool unsafeDSOffsetFoldingEnabled() const { return EnableUnsafeDSOffsetFolding; } @@ -376,23 +538,10 @@ public: unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const; - /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if - /// the given LDS memory size is the only constraint. 
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; - - unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const { - const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); - return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); - } - bool hasFP16Denormals() const { return FP64FP16Denormals; } - bool hasFP32Denormals() const { - return FP32Denormals; - } - bool hasFP64Denormals() const { return FP64FP16Denormals; } @@ -401,10 +550,6 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasFPExceptions() const { - return FPExceptions; - } - bool enableDX10Clamp() const { return DX10Clamp; } @@ -417,6 +562,12 @@ public: return FlatForGlobal; } + /// \returns If target supports ds_read/write_b128 and user enables generation + /// of ds_read/write_b128. + bool useDS128() const { + return CIInsts && EnableDS128; + } + /// \returns If MUBUF instructions always perform range checking, even for /// buffer resources used for private memory access. bool privateMemoryResourceIsRangeChecked() const { @@ -440,7 +591,7 @@ public: } bool hasApertureRegs() const { - return HasApertureRegs; + return HasApertureRegs; } bool isTrapHandlerEnabled() const { @@ -467,6 +618,10 @@ public: return FlatScratchInsts; } + bool hasFlatLgkmVMemCountInOrder() const { + return getGeneration() > GFX9; + } + bool hasD16LoadStore() const { return getGeneration() >= GFX9; } @@ -481,31 +636,19 @@ public: return AddNoCarryInsts; } - bool isMesaKernel(const MachineFunction &MF) const { - return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction().getCallingConv()); + bool hasUnpackedD16VMem() const { + return HasUnpackedD16VMem; } // Covers VS/PS/CS graphics shaders - bool isMesaGfxShader(const MachineFunction &MF) const { - return isMesa3DOS() && AMDGPU::isShader(MF.getFunction().getCallingConv()); - } - - bool isAmdCodeObjectV2(const MachineFunction &MF) const { - return isAmdHsaOS() || isMesaKernel(MF); + bool isMesaGfxShader(const Function &F) const { + return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); } bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; } - bool hasFminFmaxLegacy() const { - return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - } - - bool hasSDWA() const { - return HasSDWA; - } - bool hasSDWAOmod() const { return HasSDWAOmod; } @@ -526,29 +669,28 @@ public: return HasSDWAOutModsVOPC; } - /// \brief Returns the offset in bytes from the start of the input buffer - /// of the first explicit kernel argument. - unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { - return isAmdCodeObjectV2(MF) ? 0 : 36; + bool vmemWriteNeedsExpWaitcnt() const { + return getGeneration() < SEA_ISLANDS; } - unsigned getAlignmentForImplicitArgPtr() const { - return isAmdHsaOS() ? 8 : 4; + bool hasDLInsts() const { + return HasDLInsts; } - unsigned getImplicitArgNumBytes(const MachineFunction &MF) const { - if (isMesaKernel(MF)) - return 16; - if (isAmdHsaOS() && isOpenCLEnv()) - return 32; - return 0; + bool d16PreservesUnusedBits() const { + return D16PreservesUnusedBits; } // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspecive of an arbitrary workitem, this // is 4-byte aligned. + // + // Only 4-byte alignment is really needed to access anything. Transformations + // on the pointer value itself may rely on the alignment / known low bits of + // the pointer. 
Set this to something above the minimum to avoid needing + // dynamic realignment in common cases. unsigned getStackAlignment() const { - return 4; + return 16; } bool enableMachineScheduler() const override { @@ -559,184 +701,43 @@ public: return true; } - void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} - bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} + void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } + bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } /// \returns Number of execution units per compute unit supported by the /// subtarget. unsigned getEUsPerCU() const { - return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits()); - } - - /// \returns Maximum number of work groups per compute unit supported by the - /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(), - FlatWorkGroupSize); + return AMDGPU::IsaInfo::getEUsPerCU(MCSubtargetInfo::getFeatureBits()); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerCU() const { - return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits()); + return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits()); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(), + return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize); } - /// \returns Minimum number of waves per execution unit supported by the - /// subtarget. - unsigned getMinWavesPerEU() const { - return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits()); - } - /// \returns Maximum number of waves per execution unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerEU() const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits()); - } - - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(), - FlatWorkGroupSize); - } - - /// \returns Minimum flat work group size supported by the subtarget. - unsigned getMinFlatWorkGroupSize() const { - return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits()); - } - - /// \returns Maximum flat work group size supported by the subtarget. - unsigned getMaxFlatWorkGroupSize() const { - return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits()); + return AMDGPU::IsaInfo::getMaxWavesPerEU(); } /// \returns Number of waves per work group supported by the subtarget and /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(), - FlatWorkGroupSize); + return AMDGPU::IsaInfo::getWavesPerWorkGroup( + MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize); } - /// \returns Default range flat work group size for a calling convention. 
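The wave-count wrappers above relate flat work group size to wavefront occupancy. A conceptual sketch of the relationship, not the IsaInfo implementation: a flat work group of N work items occupies ceil(N / wavefront size) waves on a compute unit.

// Conceptual sketch only (not the IsaInfo implementation).
static unsigned wavesPerWorkGroup(unsigned FlatWorkGroupSize,
                                  unsigned WavefrontSize) {
  return (FlatWorkGroupSize + WavefrontSize - 1) / WavefrontSize;
}
// e.g. a 256-item work group on a 64-wide wavefront runs as 4 waves.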
- std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; - - /// \returns Subtarget's default pair of minimum/maximum flat work group sizes - /// for function \p F, or minimum/maximum flat work group sizes explicitly - /// requested using "amdgpu-flat-work-group-size" attribute attached to - /// function \p F. - /// - /// \returns Subtarget's default values if explicitly requested values cannot - /// be converted to integer, or violate subtarget's specifications. - std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; - - /// \returns Subtarget's default pair of minimum/maximum number of waves per - /// execution unit for function \p F, or minimum/maximum number of waves per - /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute - /// attached to function \p F. - /// - /// \returns Subtarget's default values if explicitly requested values cannot - /// be converted to integer, violate subtarget's specifications, or are not - /// compatible with minimum/maximum number of waves limited by flat work group - /// size, register usage, and/or lds usage. - std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; - - /// Creates value range metadata on an workitemid.* inrinsic call or load. - bool makeLIDRangeMetadata(Instruction *I) const; -}; - -class R600Subtarget final : public AMDGPUSubtarget { -private: - R600InstrInfo InstrInfo; - R600FrameLowering FrameLowering; - R600TargetLowering TLInfo; - -public: - R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, - const TargetMachine &TM); - - const R600InstrInfo *getInstrInfo() const override { - return &InstrInfo; - } - - const R600FrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - - const R600TargetLowering *getTargetLowering() const override { - return &TLInfo; - } - - const R600RegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } - - bool hasCFAluBug() const { - return CFALUBug; - } - - bool hasVertexCache() const { - return HasVertexCache; - } - - short getTexVTXClauseSize() const { - return TexVTXClauseSize; - } -}; - -class SISubtarget final : public AMDGPUSubtarget { -private: - SIInstrInfo InstrInfo; - SIFrameLowering FrameLowering; - SITargetLowering TLInfo; - - /// GlobalISel related APIs. - std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; - std::unique_ptr<InstructionSelector> InstSelector; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - -public: - SISubtarget(const Triple &TT, StringRef CPU, StringRef FS, - const TargetMachine &TM); - - const SIInstrInfo *getInstrInfo() const override { - return &InstrInfo; - } - - const SIFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - - const SITargetLowering *getTargetLowering() const override { - return &TLInfo; - } - - const CallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } - - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } - - const SIRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } + // static wrappers + static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); // XXX - Why is this here if it isn't in the default pass set? 
bool enableEarlyIfConversion() const override { @@ -746,7 +747,7 @@ public: void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; - bool isVGPRSpillingEnabled(const Function& F) const; + bool isVGPRSpillingEnabled(const Function &F) const; unsigned getMaxNumUserSGPRs() const { return 16; @@ -776,6 +777,10 @@ public: return HasScalarStores; } + bool hasScalarAtomics() const { + return HasScalarAtomics; + } + bool hasInv2PiInlineImm() const { return HasInv2PiInlineImm; } @@ -789,18 +794,13 @@ public: } bool debuggerSupported() const { - return debuggerInsertNops() && debuggerReserveRegs() && - debuggerEmitPrologue(); + return debuggerInsertNops() && debuggerEmitPrologue(); } bool debuggerInsertNops() const { return DebuggerInsertNops; } - bool debuggerReserveRegs() const { - return DebuggerReserveRegs; - } - bool debuggerEmitPrologue() const { return DebuggerEmitPrologue; } @@ -829,52 +829,61 @@ public: return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; } - unsigned getKernArgSegmentSize(const MachineFunction &MF, - unsigned ExplictArgBytes) const; - - /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs + /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; - /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs + /// Return the maximum number of waves per SIMD for kernels using \p VGPRs + /// VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. bool flatScratchIsPointer() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + + /// \returns true if the machine has merged shaders in which s0-s7 are + /// reserved by the hardware and user SGPRs start at s8 + bool hasMergedShaders() const { return getGeneration() >= GFX9; } /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { - return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getSGPRAllocGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns SGPR encoding granularity supported by the subtarget. unsigned getSGPREncodingGranule() const { - return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getSGPREncodingGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns Total number of SGPRs supported by the subtarget. unsigned getTotalNumSGPRs() const { - return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getTotalNumSGPRs(MCSubtargetInfo::getFeatureBits()); } /// \returns Addressable number of SGPRs supported by the subtarget. unsigned getAddressableNumSGPRs() const { - return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getAddressableNumSGPRs( + MCSubtargetInfo::getFeatureBits()); } /// \returns Minimum number of SGPRs that meets the given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMinNumSGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU); + return AMDGPU::IsaInfo::getMinNumSGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU); } /// \returns Maximum number of SGPRs that meets the given number of waves per /// execution unit requirement supported by the subtarget. 
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { - return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU, - Addressable); + return AMDGPU::IsaInfo::getMaxNumSGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU, Addressable); } /// \returns Reserved number of SGPRs for given function \p MF. @@ -892,39 +901,39 @@ public: /// \returns VGPR allocation granularity supported by the subtarget. unsigned getVGPRAllocGranule() const { - return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getVGPRAllocGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns VGPR encoding granularity supported by the subtarget. unsigned getVGPREncodingGranule() const { - return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getVGPREncodingGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns Total number of VGPRs supported by the subtarget. unsigned getTotalNumVGPRs() const { - return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getTotalNumVGPRs(MCSubtargetInfo::getFeatureBits()); } /// \returns Addressable number of VGPRs supported by the subtarget. unsigned getAddressableNumVGPRs() const { - return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getAddressableNumVGPRs( + MCSubtargetInfo::getFeatureBits()); } /// \returns Minimum number of VGPRs that meets given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMinNumVGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU); + return AMDGPU::IsaInfo::getMinNumVGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU); } /// \returns Maximum number of VGPRs that meets given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU); - } - - /// \returns Reserved number of VGPRs for given function \p MF. - unsigned getReservedNumVGPRs(const MachineFunction &MF) const { - return debuggerReserveRegs() ? 4 : 0; + return AMDGPU::IsaInfo::getMaxNumVGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU); } /// \returns Maximum number of VGPRs that meets number of waves per execution @@ -942,6 +951,119 @@ public: const override; }; +class R600Subtarget final : public R600GenSubtargetInfo, + public AMDGPUSubtarget { +private: + R600InstrInfo InstrInfo; + R600FrameLowering FrameLowering; + bool FMA; + bool CaymanISA; + bool CFALUBug; + bool DX10Clamp; + bool HasVertexCache; + bool R600ALUInst; + bool FP64; + short TexVTXClauseSize; + Generation Gen; + R600TargetLowering TLInfo; + InstrItineraryData InstrItins; + SelectionDAGTargetInfo TSInfo; + AMDGPUAS AS; + +public: + R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, + const TargetMachine &TM); + + const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; } + + const R600FrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + + const R600TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + + const R600RegisterInfo *getRegisterInfo() const override { + return &InstrInfo.getRegisterInfo(); + } + + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + // Nothing implemented, just prevent crashes on use. 
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + Generation getGeneration() const { + return Gen; + } + + unsigned getStackAlignment() const { + return 4; + } + + R600Subtarget &initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS); + + bool hasBFE() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFI() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBCNT(unsigned Size) const { + if (Size == 32) + return (getGeneration() >= EVERGREEN); + + return false; + } + + bool hasBORROW() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasCARRY() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasCaymanISA() const { + return CaymanISA; + } + + bool hasFFBL() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasFFBH() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasFMA() const { return FMA; } + + bool hasCFAluBug() const { return CFALUBug; } + + bool hasVertexCache() const { return HasVertexCache; } + + short getTexVTXClauseSize() const { return TexVTXClauseSize; } + + AMDGPUAS getAMDGPUAS() const { return AS; } + + bool enableMachineScheduler() const override { + return true; + } + + bool enableSubRegLiveness() const override { + return true; + } +}; + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2042dbf6d5e2..2205819c444f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief The AMDGPU target machine contains all of the hardware specific +/// The AMDGPU target machine contains all of the hardware specific /// information needed to emit code for R600 and SI GPUs. // //===----------------------------------------------------------------------===// @@ -31,7 +31,6 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -40,6 +39,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" @@ -79,7 +79,7 @@ static cl::opt<bool> EnableLoadStoreVectorizer( cl::init(true), cl::Hidden); -// Option to to control global loads scalarization +// Option to control global loads scalarization static cl::opt<bool> ScalarizeGlobal( "amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), @@ -110,12 +110,6 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true)); -// Option to enable new waitcnt insertion pass. 
-static cl::opt<bool> EnableSIInsertWaitcntsPass( - "enable-si-insert-waitcnts", - cl::desc("Use new waitcnt insertion pass"), - cl::init(true)); - // Option to run late CFG structurizer static cl::opt<bool, true> LateCFGStructurize( "amdgpu-late-structurize", @@ -123,16 +117,23 @@ static cl::opt<bool, true> LateCFGStructurize( cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); -static cl::opt<bool> EnableAMDGPUFunctionCalls( +static cl::opt<bool, true> EnableAMDGPUFunctionCalls( "amdgpu-function-calls", - cl::Hidden, cl::desc("Enable AMDGPU function call support"), - cl::init(false)); + cl::location(AMDGPUTargetMachine::EnableFunctionCalls), + cl::init(false), + cl::Hidden); // Enable lib calls simplifications static cl::opt<bool> EnableLibCallSimplify( "amdgpu-simplify-libcall", - cl::desc("Enable mdgpu library simplifications"), + cl::desc("Enable amdgpu library simplifications"), + cl::init(true), + cl::Hidden); + +static cl::opt<bool> EnableLowerKernelArguments( + "amdgpu-ir-lower-kernel-arguments", + cl::desc("Lower kernel argument loads in IR pass"), cl::init(true), cl::Hidden); @@ -147,6 +148,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeR600PacketizerPass(*PR); initializeR600ExpandSpecialInstrsPassPass(*PR); initializeR600VectorRegMergerPass(*PR); + initializeGlobalISel(*PR); initializeAMDGPUDAGToDAGISelPass(*PR); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); @@ -160,6 +162,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); + initializeAMDGPULowerKernelArgumentsPass(*PR); + initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); @@ -167,7 +171,6 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); - initializeSIInsertWaitsPass(*PR); initializeSIInsertWaitcntsPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); @@ -176,6 +179,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); initializeSIFixWWMLivenessPass(*PR); + initializeSIFormMemoryClausesPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUUseNativeCallsPass(*PR); @@ -260,24 +264,15 @@ GCNILPSchedRegistry("gcn-ilp", static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. - if (TT.getEnvironmentName() == "amdgiz" || - TT.getEnvironmentName() == "amdgizcl") return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; - return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"; } // 32-bit private, local, and region pointers. 64-bit global, constant and // flat. 
- if (TT.getEnvironmentName() == "amdgiz" || - TT.getEnvironmentName() == "amdgizcl") - return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" + return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; - return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" - "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"; } LLVM_READNONE @@ -317,9 +312,10 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; - bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; +bool AMDGPUTargetMachine::EnableFunctionCalls = false; + +AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { Attribute GPUAttr = F.getFnAttribute("target-cpu"); @@ -412,6 +408,10 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { // Add infer address spaces pass to the opt pipeline after inlining // but before SROA to increase SROA opportunities. PM.add(createInferAddressSpacesPass()); + + // This should run after inlining to have any chance of doing anything, + // and before other cleanup optimizations. + PM.add(createAMDGPULowerKernelAttributesPass()); }); } @@ -449,6 +449,11 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl( return I.get(); } +TargetTransformInfo +R600TargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(R600TTIImpl(this, F)); +} + //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// @@ -461,7 +466,7 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL, bool JIT) : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} -const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { +const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { StringRef GPU = getGPUName(F); StringRef FS = getFeatureString(F); @@ -474,7 +479,7 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this); + I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); } I->setScalarizeGlobalBehavior(ScalarizeGlobal); @@ -482,6 +487,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { return I.get(); } +TargetTransformInfo +GCNTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(GCNTTIImpl(this, F)); +} + //===----------------------------------------------------------------------===// // AMDGPU Pass Setup //===----------------------------------------------------------------------===// @@ -571,11 +581,6 @@ public: } // end anonymous namespace -TargetTransformInfo -AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) { - return TargetTransformInfo(AMDGPUTTIImpl(this, F)); -} - void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createGVNPass()); @@ -584,6 +589,7 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { } void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { + addPass(createLICMPass()); addPass(createSeparateConstOffsetFromGEPPass()); addPass(createSpeculativeExecutionPass()); // ReassociateGEPs exposes more opportunites for SLSR. See @@ -629,7 +635,8 @@ void AMDGPUPassConfig::addIRPasses() { } // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. - addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + if (TM.getTargetTriple().getArch() == Triple::r600) + addPass(createR600OpenCLImageTypeLoweringPass()); // Replace OpenCL enqueued block function pointers with global variables. addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); @@ -672,6 +679,10 @@ void AMDGPUPassConfig::addIRPasses() { } void AMDGPUPassConfig::addCodeGenPrepare() { + if (TM->getTargetTriple().getArch() == Triple::amdgcn && + EnableLowerKernelArguments) + addPass(createAMDGPULowerKernelArgumentsPass()); + TargetPassConfig::addCodeGenPrepare(); if (EnableLoadStoreVectorizer) @@ -739,7 +750,7 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( MachineSchedContext *C) const { - const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); if (ST.enableSIScheduler()) return createSIMachineScheduler(C); return createGCNMaxOccupancyMachineScheduler(C); @@ -782,7 +793,7 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&SILoadStoreOptimizerID); if (EnableSDWAPeephole) { addPass(&SIPeepholeSDWAID); - addPass(&MachineLICMID); + addPass(&EarlyMachineLICMID); addPass(&MachineCSEID); addPass(&SIFoldOperandsID); addPass(&DeadMachineInstructionElimID); @@ -851,6 +862,8 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); + insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID); + // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. 
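Aside (not part of the patch): earlier in this file the amdgpu-function-calls option changes from cl::opt<bool> to cl::opt<bool, true> with cl::location(AMDGPUTargetMachine::EnableFunctionCalls), matching the pattern already used for amdgpu-late-structurize. With ExternalStorage = true the parsed value is written straight into the referenced static member instead of being held by the option object, so target code can test the flag directly. A minimal, self-contained sketch of that pattern against llvm/Support/CommandLine.h, using hypothetical names:

#include "llvm/Support/CommandLine.h"

struct MyTargetMachine {
  static bool EnableFunctionCalls;   // analogous to AMDGPUTargetMachine::EnableFunctionCalls
};
bool MyTargetMachine::EnableFunctionCalls = false;

// ExternalStorage = true: cl::location binds the option to the static bool,
// so parsing writes the value into MyTargetMachine::EnableFunctionCalls.
static llvm::cl::opt<bool, true> EnableCallsFlag(
    "my-function-calls",                              // hypothetical flag name
    llvm::cl::desc("Enable function call support (sketch)"),
    llvm::cl::location(MyTargetMachine::EnableFunctionCalls),
    llvm::cl::init(false), llvm::cl::Hidden);

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  return MyTargetMachine::EnableFunctionCalls ? 0 : 1;
}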
@@ -873,6 +886,10 @@ void GCNPassConfig::addPreSched2() { } void GCNPassConfig::addPreEmitPass() { + addPass(createSIMemoryLegalizerPass()); + addPass(createSIInsertWaitcntsPass()); + addPass(createSIShrinkInstructionsPass()); + // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there // are multiple scheduling regions in a basic block, the regions are scheduled @@ -881,15 +898,12 @@ void GCNPassConfig::addPreEmitPass() { // // Here we add a stand-alone hazard recognizer pass which can handle all // cases. + // + // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would + // be better for it to emit S_NOP <N> when possible. addPass(&PostRAHazardRecognizerID); - if (EnableSIInsertWaitcntsPass) - addPass(createSIInsertWaitcntsPass()); - else - addPass(createSIInsertWaitsPass()); - addPass(createSIShrinkInstructionsPass()); addPass(&SIInsertSkipsPassID); - addPass(createSIMemoryLegalizerPass()); addPass(createSIDebuggerInsertNopsPass()); addPass(&BranchRelaxationPassID); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 5f9b2a7fca20..0fe14493fabd 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets. +/// The AMDGPU TargetMachine interface definition for hw codgen targets. // //===----------------------------------------------------------------------===// @@ -34,7 +34,6 @@ namespace llvm { class AMDGPUTargetMachine : public LLVMTargetMachine { protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; - AMDGPUIntrinsicInfo IntrinsicInfo; AMDGPUAS AS; StringRef getGPUName(const Function &F) const; @@ -42,6 +41,7 @@ protected: public: static bool EnableLateStructurizeCFG; + static bool EnableFunctionCalls; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -49,13 +49,8 @@ public: CodeGenOpt::Level OL); ~AMDGPUTargetMachine() override; - const AMDGPUSubtarget *getSubtargetImpl() const; - const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override = 0; - - const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { - return &IntrinsicInfo; - } - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + const TargetSubtargetInfo *getSubtargetImpl() const; + const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override = 0; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); @@ -91,6 +86,8 @@ public: const R600Subtarget *getSubtargetImpl(const Function &) const override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; + bool isMachineVerifierClean() const override { return false; } @@ -102,7 +99,8 @@ public: class GCNTargetMachine final : public AMDGPUTargetMachine { private: - mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap; + AMDGPUIntrinsicInfo IntrinsicInfo; + mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap; public: GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, @@ -112,7 +110,13 @@ public: TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - const SISubtarget *getSubtargetImpl(const Function &) const override; + const GCNSubtarget *getSubtargetImpl(const Function &) const 
override; + + TargetTransformInfo getTargetTransformInfo(const Function &F) override; + + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { + return &IntrinsicInfo; + } bool useIPRA() const override { return true; diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index ca6210f69298..dd9dc1a88fc2 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// \brief This file declares the AMDGPU-specific subclass of +/// This file declares the AMDGPU-specific subclass of /// TargetLoweringObjectFile. /// //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 77c2d4b956c6..a68b8d03f06e 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -17,12 +17,12 @@ #include "AMDGPUTargetTransformInfo.h" #include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -43,6 +43,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include <algorithm> @@ -101,7 +102,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned ThresholdPrivate = UnrollThresholdPrivate; unsigned ThresholdLocal = UnrollThresholdLocal; unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); - AMDGPUAS ASST = ST->getAMDGPUAS(); + const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TargetTriple); for (const BasicBlock *BB : L->getBlocks()) { const DataLayout &DL = BB->getModule()->getDataLayout(); unsigned LocalGEPsSeen = 0; @@ -123,8 +124,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, continue; if (dependsOnLocalPhi(L, Br->getCondition())) { UP.Threshold += UnrollThresholdIf; - DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold - << " for loop:\n" << *L << " due to " << *Br << '\n'); + LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold + << " for loop:\n" + << *L << " due to " << *Br << '\n'); if (UP.Threshold >= MaxBoost) return; } @@ -200,61 +202,76 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, // Don't use the maximum allowed value here as it will make some // programs way too big. UP.Threshold = Threshold; - DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n" - << *L << " due to " << *GEP << '\n'); + LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold + << " for loop:\n" + << *L << " due to " << *GEP << '\n'); if (UP.Threshold >= MaxBoost) return; } } } -unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { +unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { // The concept of vector registers doesn't really exist. Some packed vector // operations operate on the normal 32-bit registers. - - // Number of VGPRs on SI. 
- if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 256; - - return 4 * 128; // XXX - 4 channels. Should these count as vector instead? + return 256; } -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const { +unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const { // This is really the number of registers to fill when vectorizing / // interleaving loops, so we lie to avoid trying to use all registers. return getHardwareNumberOfRegisters(Vec) >> 3; } -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const { +unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const { return 32; } -unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const { +unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { return 32; } -unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { +unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const { + unsigned VecRegBitWidth = VF * LoadSize; + if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32) + // TODO: Support element-size less than 32bit? + return 128 / LoadSize; + + return VF; +} + +unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const { + unsigned VecRegBitWidth = VF * StoreSize; + if (VecRegBitWidth > 128) + return 128 / StoreSize; + + return VF; +} + +unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { AMDGPUAS AS = ST->getAMDGPUAS(); if (AddrSpace == AS.GLOBAL_ADDRESS || AddrSpace == AS.CONSTANT_ADDRESS || - AddrSpace == AS.FLAT_ADDRESS) - return 128; - if (AddrSpace == AS.LOCAL_ADDRESS || + AddrSpace == AS.CONSTANT_ADDRESS_32BIT) { + return 512; + } + + if (AddrSpace == AS.FLAT_ADDRESS || + AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS) - return 64; + return 128; + if (AddrSpace == AS.PRIVATE_ADDRESS) return 8 * ST->getMaxPrivateElementSize(); - if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && - (AddrSpace == AS.PARAM_D_ADDRESS || - AddrSpace == AS.PARAM_I_ADDRESS || - (AddrSpace >= AS.CONSTANT_BUFFER_0 && - AddrSpace <= AS.CONSTANT_BUFFER_15))) - return 128; llvm_unreachable("unhandled address space"); } -bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, +bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const { // We allow vectorization of flat stores, even though we may need to decompose @@ -267,19 +284,19 @@ bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, return true; } -bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, +bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const { return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } -bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, +bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const { return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } -unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { +unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Disable unrolling if the loop is not vectorized. // TODO: Enable this again. 
if (VF == 1) @@ -288,11 +305,14 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { return 8; } -bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, +bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const { switch (Inst->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2)); auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4)); if (!Ordering || !Volatile) @@ -314,7 +334,7 @@ bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, } } -int AMDGPUTTIImpl::getArithmeticInstrCost( +int GCNTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) { @@ -424,7 +444,7 @@ int AMDGPUTTIImpl::getArithmeticInstrCost( Opd1PropInfo, Opd2PropInfo); } -unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { +unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) { // XXX - For some reason this isn't called for switch. switch (Opcode) { case Instruction::Br: @@ -435,7 +455,38 @@ unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { } } -int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, +int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty, + bool IsPairwise) { + EVT OrigTy = TLI->getValueType(DL, Ty); + + // Computes cost on targets that have packed math instructions(which support + // 16-bit types only). + if (IsPairwise || + !ST->hasVOP3PInsts() || + OrigTy.getScalarSizeInBits() != 16) + return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + return LT.first * getFullRateInstrCost(); +} + +int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwise, + bool IsUnsigned) { + EVT OrigTy = TLI->getValueType(DL, Ty); + + // Computes cost on targets that have packed math instructions(which support + // 16-bit types only). 
+ if (IsPairwise || + !ST->hasVOP3PInsts() || + OrigTy.getScalarSizeInBits() != 16) + return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + return LT.first * getHalfRateInstrCost(); +} + +int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index) { switch (Opcode) { case Instruction::ExtractElement: @@ -460,52 +511,7 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, } } -static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) { - switch (I->getIntrinsicID()) { - case Intrinsic::amdgcn_workitem_id_x: - case Intrinsic::amdgcn_workitem_id_y: - case Intrinsic::amdgcn_workitem_id_z: - case Intrinsic::amdgcn_interp_mov: - case Intrinsic::amdgcn_interp_p1: - case Intrinsic::amdgcn_interp_p2: - case Intrinsic::amdgcn_mbcnt_hi: - case Intrinsic::amdgcn_mbcnt_lo: - case Intrinsic::r600_read_tidig_x: - case Intrinsic::r600_read_tidig_y: - case Intrinsic::r600_read_tidig_z: - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_image_atomic_swap: - case Intrinsic::amdgcn_image_atomic_add: - case Intrinsic::amdgcn_image_atomic_sub: - case Intrinsic::amdgcn_image_atomic_smin: - case Intrinsic::amdgcn_image_atomic_umin: - case Intrinsic::amdgcn_image_atomic_smax: - case Intrinsic::amdgcn_image_atomic_umax: - case Intrinsic::amdgcn_image_atomic_and: - case Intrinsic::amdgcn_image_atomic_or: - case Intrinsic::amdgcn_image_atomic_xor: - case Intrinsic::amdgcn_image_atomic_inc: - case Intrinsic::amdgcn_image_atomic_dec: - case Intrinsic::amdgcn_image_atomic_cmpswap: - case Intrinsic::amdgcn_buffer_atomic_swap: - case Intrinsic::amdgcn_buffer_atomic_add: - case Intrinsic::amdgcn_buffer_atomic_sub: - case Intrinsic::amdgcn_buffer_atomic_smin: - case Intrinsic::amdgcn_buffer_atomic_umin: - case Intrinsic::amdgcn_buffer_atomic_smax: - case Intrinsic::amdgcn_buffer_atomic_umax: - case Intrinsic::amdgcn_buffer_atomic_and: - case Intrinsic::amdgcn_buffer_atomic_or: - case Intrinsic::amdgcn_buffer_atomic_xor: - case Intrinsic::amdgcn_buffer_atomic_cmpswap: - case Intrinsic::amdgcn_ps_live: - case Intrinsic::amdgcn_ds_swizzle: - return true; - default: - return false; - } -} + static bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); @@ -535,7 +541,7 @@ static bool isArgPassedInSGPR(const Argument *A) { /// \returns true if the result of the value could potentially be /// different across workitems in a wavefront. -bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { +bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { if (const Argument *A = dyn_cast<Argument>(V)) return !isArgPassedInSGPR(A); @@ -556,7 +562,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { return true; if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) - return isIntrinsicSourceOfDivergence(Intrinsic); + return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID()); // Assume all function calls are a source of divergence. 
if (isa<CallInst>(V) || isa<InvokeInst>(V)) @@ -565,7 +571,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { return false; } -bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const { +bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { switch (Intrinsic->getIntrinsicID()) { default: @@ -578,7 +584,7 @@ bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const { return false; } -unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, +unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { if (ST->hasVOP3PInsts()) { VectorType *VT = cast<VectorType>(Tp); @@ -601,7 +607,7 @@ unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Inde return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller, +bool GCNTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); const FeatureBitset &CallerBits = @@ -613,3 +619,114 @@ bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller, FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; return ((RealCallerBits & RealCalleeBits) == RealCalleeBits); } + +void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP) { + CommonTTI.getUnrollingPreferences(L, SE, UP); +} + +unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const { + return 4 * 128; // XXX - 4 channels. Should these count as vector instead? +} + +unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const { + return getHardwareNumberOfRegisters(Vec); +} + +unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const { + return 32; +} + +unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const { + return 32; +} + +unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { + AMDGPUAS AS = ST->getAMDGPUAS(); + if (AddrSpace == AS.GLOBAL_ADDRESS || + AddrSpace == AS.CONSTANT_ADDRESS) + return 128; + if (AddrSpace == AS.LOCAL_ADDRESS || + AddrSpace == AS.REGION_ADDRESS) + return 64; + if (AddrSpace == AS.PRIVATE_ADDRESS) + return 32; + + if ((AddrSpace == AS.PARAM_D_ADDRESS || + AddrSpace == AS.PARAM_I_ADDRESS || + (AddrSpace >= AS.CONSTANT_BUFFER_0 && + AddrSpace <= AS.CONSTANT_BUFFER_15))) + return 128; + llvm_unreachable("unhandled address space"); +} + +bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + // We allow vectorization of flat stores, even though we may need to decompose + // them later if they may access private memory. We don't have enough context + // here, and legalization can handle it. + if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) + return false; + return true; +} + +bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); +} + +bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); +} + +unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) { + // Disable unrolling if the loop is not vectorized. + // TODO: Enable this again. 
+ if (VF == 1) + return 1; + + return 8; +} + +unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) { + // XXX - For some reason this isn't called for switch. + switch (Opcode) { + case Instruction::Br: + case Instruction::Ret: + return 10; + default: + return BaseT::getCFInstrCost(Opcode); + } +} + +int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + case Instruction::InsertElement: { + unsigned EltSize + = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType()); + if (EltSize < 32) { + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } + + // Extracts are just reads of a subregister, so are free. Inserts are + // considered free because we don't want to have any cost for scalarizing + // operations, and we don't have to copy into a different register class. + + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 2 : 0; + } + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} + +void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP) { + CommonTTI.getUnrollingPreferences(L, SE, UP); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 8899d2c6da8a..8e63d789e17d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -21,6 +21,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -44,8 +45,26 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { friend BaseT; - const AMDGPUSubtarget *ST; + Triple TargetTriple; + +public: + explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), + TargetTriple(TM->getTargetTriple()) {} + + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); +}; + +class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { + using BaseT = BasicTTIImplBase<GCNTTIImpl>; + using TTI = TargetTransformInfo; + + friend BaseT; + + const GCNSubtarget *ST; const AMDGPUTargetLowering *TLI; + AMDGPUTTIImpl CommonTTI; bool IsGraphicsShader; const FeatureBitset InlineFeatureIgnoreList = { @@ -61,7 +80,6 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { AMDGPU::FeatureAutoWaitcntBeforeBarrier, AMDGPU::FeatureDebuggerEmitPrologue, AMDGPU::FeatureDebuggerInsertNops, - AMDGPU::FeatureDebuggerReserveRegs, // Property of the kernel/environment which can't actually differ. 
AMDGPU::FeatureSGPRInitBug, @@ -73,7 +91,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { AMDGPU::HalfRate64Ops }; - const AMDGPUSubtarget *getST() const { return ST; } + const GCNSubtarget *getST() const { return ST; } const AMDGPUTargetLowering *getTLI() const { return TLI; } static inline int getFullRateInstrCost() { @@ -98,10 +116,11 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { } public: - explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) + explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), - ST(TM->getSubtargetImpl(F)), + ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))), TLI(ST->getTargetLowering()), + CommonTTI(TM, F), IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {} bool hasBranchDivergence() { return true; } @@ -118,6 +137,12 @@ public: unsigned getNumberOfRegisters(bool Vector) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getMinVectorRegisterBitWidth() const; + unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const; + unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, @@ -166,6 +191,53 @@ public: const Function *Callee) const; unsigned getInliningThresholdMultiplier() { return 9; } + + int getArithmeticReductionCost(unsigned Opcode, + Type *Ty, + bool IsPairwise); + int getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwiseForm, + bool IsUnsigned); +}; + +class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> { + using BaseT = BasicTTIImplBase<R600TTIImpl>; + using TTI = TargetTransformInfo; + + friend BaseT; + + const R600Subtarget *ST; + const AMDGPUTargetLowering *TLI; + AMDGPUTTIImpl CommonTTI; + +public: + explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), + ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))), + TLI(ST->getTargetLowering()), + CommonTTI(TM, F) {} + + const R600Subtarget *getST() const { return ST; } + const AMDGPUTargetLowering *getTLI() const { return TLI; } + + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); + unsigned getHardwareNumberOfRegisters(bool Vec) const; + unsigned getNumberOfRegisters(bool Vec) const; + unsigned getRegisterBitWidth(bool Vector) const; + unsigned getMinVectorRegisterBitWidth() const; + unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; + bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment, + unsigned AddrSpace) const; + bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getCFInstrCost(unsigned Opcode); + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 6107f3a7dd18..0d3a1673696a 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -28,6 +28,7 @@ #include 
"llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -39,7 +40,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils.h" using namespace llvm; @@ -144,7 +145,8 @@ static BasicBlock *unifyReturnBlockSet(Function &F, if (PN) PN->addIncoming(BB->getTerminator()->getOperand(0), BB); - BB->getInstList().pop_back(); // Remove the return insn + // Remove and delete the return inst. + BB->getTerminator()->eraseFromParent(); BranchInst::Create(NewRetBlock, BB); } @@ -168,6 +170,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { SmallVector<BasicBlock *, 4> ReturningBlocks; SmallVector<BasicBlock *, 4> UnreachableBlocks; + // Dummy return block for infinite loop. + BasicBlock *DummyReturnBB = nullptr; + for (BasicBlock *BB : PDT.getRoots()) { if (isa<ReturnInst>(BB->getTerminator())) { if (!isUniformlyReached(DA, *BB)) @@ -175,6 +180,35 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { } else if (isa<UnreachableInst>(BB->getTerminator())) { if (!isUniformlyReached(DA, *BB)) UnreachableBlocks.push_back(BB); + } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { + + ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); + if (DummyReturnBB == nullptr) { + DummyReturnBB = BasicBlock::Create(F.getContext(), + "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + } + + if (BI->isUnconditional()) { + BasicBlock *LoopHeaderBB = BI->getSuccessor(0); + BI->eraseFromParent(); // Delete the unconditional branch. + // Add a new conditional branch with a dummy edge to the return block. + BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); + } else { // Conditional branch. + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BasicBlock::Create(F.getContext(), + "TransitionBlock", &F); + + // Move BI from BB to the new transition block. + BI->removeFromParent(); + TransitionBB->getInstList().push_back(BI); + + // Create a branch that will always branch to the transition block. + BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + } } } @@ -189,7 +223,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { new UnreachableInst(F.getContext(), UnreachableBlock); for (BasicBlock *BB : UnreachableBlocks) { - BB->getInstList().pop_back(); // Remove the unreachable inst. + // Remove and delete the unreachable inst. + BB->getTerminator()->eraseFromParent(); BranchInst::Create(UnreachableBlock, BB); } } @@ -200,7 +235,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { Type *RetTy = F.getReturnType(); Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy); - UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst. + // Remove and delete the unreachable inst. 
+ UnreachableBlock->getTerminator()->eraseFromParent(); Function *UnreachableIntrin = Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable); diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp index b78568e89cfb..1f6d9234c1ed 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // \file -// \brief This pass that unifies multiple OpenCL metadata due to linking. +// This pass that unifies multiple OpenCL metadata due to linking. // //===----------------------------------------------------------------------===// @@ -37,7 +37,7 @@ namespace { } // end namespace kOCLMD - /// \brief Unify multiple OpenCL metadata due to linking. + /// Unify multiple OpenCL metadata due to linking. class AMDGPUUnifyMetadata : public ModulePass { public: static char ID; @@ -47,7 +47,7 @@ namespace { private: bool runOnModule(Module &M) override; - /// \brief Unify version metadata. + /// Unify version metadata. /// \return true if changes are made. /// Assume the named metadata has operands each of which is a pair of /// integer constant, e.g. @@ -82,7 +82,7 @@ namespace { return true; } - /// \brief Unify version metadata. + /// Unify version metadata. /// \return true if changes are made. /// Assume the named metadata has operands each of which is a list e.g. /// !Name = {!n1, !n2} diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 0a0e43123ae0..11cd49e5b3dc 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -11,6 +11,7 @@ #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" #include "R600RegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallPtrSet.h" @@ -28,12 +29,12 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/raw_ostream.h" #include <cassert> #include <cstddef> @@ -78,23 +79,18 @@ namespace { // //===----------------------------------------------------------------------===// -#define SHOWNEWINSTR(i) \ - DEBUG(dbgs() << "New instr: " << *i << "\n"); +#define SHOWNEWINSTR(i) LLVM_DEBUG(dbgs() << "New instr: " << *i << "\n"); -#define SHOWNEWBLK(b, msg) \ -DEBUG( \ - dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ - dbgs() << "\n"; \ -); +#define SHOWNEWBLK(b, msg) \ + LLVM_DEBUG(dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + dbgs() << "\n";); -#define SHOWBLK_DETAIL(b, msg) \ -DEBUG( \ - if (b) { \ - dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ - b->print(dbgs()); \ - dbgs() << "\n"; \ - } \ -); +#define SHOWBLK_DETAIL(b, msg) \ + LLVM_DEBUG(if (b) { \ + dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + b->print(dbgs()); \ + dbgs() << "\n"; \ + }); #define INVALIDSCCNUM -1 @@ -158,19 +154,19 @@ public: bool runOnMachineFunction(MachineFunction &MF) override { TII = MF.getSubtarget<R600Subtarget>().getInstrInfo(); TRI = &TII->getRegisterInfo(); - 
DEBUG(MF.dump();); + LLVM_DEBUG(MF.dump();); OrderedBlks.clear(); Visited.clear(); FuncRep = &MF; MLI = &getAnalysis<MachineLoopInfo>(); - DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); + LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); MDT = &getAnalysis<MachineDominatorTree>(); - DEBUG(MDT->print(dbgs(), (const Module*)nullptr);); + LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr);); PDT = &getAnalysis<MachinePostDominatorTree>(); - DEBUG(PDT->print(dbgs());); + LLVM_DEBUG(PDT->print(dbgs());); prepare(); run(); - DEBUG(MF.dump();); + LLVM_DEBUG(MF.dump();); return true; } @@ -436,19 +432,19 @@ void AMDGPUCFGStructurizer::reversePredicateSetter( for (;; --I) { if (I == MBB.end()) continue; - if (I->getOpcode() == AMDGPU::PRED_X) { + if (I->getOpcode() == R600::PRED_X) { switch (I->getOperand(2).getImm()) { - case AMDGPU::PRED_SETE_INT: - I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT); + case R600::PRED_SETE_INT: + I->getOperand(2).setImm(R600::PRED_SETNE_INT); return; - case AMDGPU::PRED_SETNE_INT: - I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT); + case R600::PRED_SETNE_INT: + I->getOperand(2).setImm(R600::PRED_SETE_INT); return; - case AMDGPU::PRED_SETE: - I->getOperand(2).setImm(AMDGPU::PRED_SETNE); + case R600::PRED_SETE: + I->getOperand(2).setImm(R600::PRED_SETNE); return; - case AMDGPU::PRED_SETNE: - I->getOperand(2).setImm(AMDGPU::PRED_SETE); + case R600::PRED_SETNE: + I->getOperand(2).setImm(R600::PRED_SETE); return; default: llvm_unreachable("PRED_X Opcode invalid!"); @@ -517,10 +513,10 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore( int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; + case R600::JUMP_COND: + case R600::JUMP: return R600::IF_PREDICATE_SET; + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return R600::IF_LOGICALNZ_f32; default: llvm_unreachable("internal error"); } return -1; @@ -528,10 +524,10 @@ int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; + case R600::JUMP_COND: + case R600::JUMP: return R600::IF_PREDICATE_SET; + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return R600::IF_LOGICALZ_f32; default: llvm_unreachable("internal error"); } return -1; @@ -539,8 +535,8 @@ int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; + case R600::JUMP_COND: + case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32; default: llvm_unreachable("internal error"); } return -1; @@ -548,8 +544,8 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; + case R600::JUMP_COND: + case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32; default: llvm_unreachable("internal error"); } return -1; @@ -577,9 +573,9 @@ AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, bool 
AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { switch (MI->getOpcode()) { - case AMDGPU::JUMP_COND: - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return true; + case R600::JUMP_COND: + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return true; default: return false; } @@ -588,8 +584,8 @@ bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { switch (MI->getOpcode()) { - case AMDGPU::JUMP: - case AMDGPU::BRANCH: + case R600::JUMP: + case R600::BRANCH: return true; default: return false; @@ -638,7 +634,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { MachineBasicBlock::reverse_iterator It = MBB->rbegin(); if (It != MBB->rend()) { MachineInstr *instr = &(*It); - if (instr->getOpcode() == AMDGPU::RETURN) + if (instr->getOpcode() == R600::RETURN) return instr; } return nullptr; @@ -650,9 +646,8 @@ bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { if (MI) assert(IsReturn); else if (IsReturn) - DEBUG( - dbgs() << "BB" << MBB->getNumber() - <<" is return block without RETURN instr\n";); + LLVM_DEBUG(dbgs() << "BB" << MBB->getNumber() + << " is return block without RETURN instr\n";); return IsReturn; } @@ -692,8 +687,8 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator It = Pre; while (It != E) { - if (Pre->getOpcode() == AMDGPU::CONTINUE - && It->getOpcode() == AMDGPU::ENDLOOP) + if (Pre->getOpcode() == R600::CONTINUE + && It->getOpcode() == R600::ENDLOOP) ContInstr.push_back(&*Pre); Pre = It; ++It; @@ -714,7 +709,7 @@ bool AMDGPUCFGStructurizer::prepare() { //FIXME: if not reducible flow graph, make it so ??? - DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); + LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); orderBlocks(FuncRep); @@ -757,14 +752,14 @@ bool AMDGPUCFGStructurizer::prepare() { bool AMDGPUCFGStructurizer::run() { //Assume reducible CFG... - DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); + LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); #ifdef STRESSTEST //Use the worse block ordering to test the algorithm. ReverseVector(orderedBlks); #endif - DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); + LLVM_DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); int NumIter = 0; bool Finish = false; MachineBasicBlock *MBB; @@ -774,10 +769,8 @@ bool AMDGPUCFGStructurizer::run() { do { ++NumIter; - DEBUG( - dbgs() << "numIter = " << NumIter - << ", numRemaintedBlk = " << NumRemainedBlk << "\n"; - ); + LLVM_DEBUG(dbgs() << "numIter = " << NumIter + << ", numRemaintedBlk = " << NumRemainedBlk << "\n";); SmallVectorImpl<MachineBasicBlock *>::const_iterator It = OrderedBlks.begin(); @@ -799,10 +792,8 @@ bool AMDGPUCFGStructurizer::run() { SccBeginMBB = MBB; SccNumIter = 0; SccNumBlk = NumRemainedBlk; // Init to maximum possible number. 
- DEBUG( - dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); - dbgs() << "\n"; - ); + LLVM_DEBUG(dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); + dbgs() << "\n";); } if (!isRetiredBlock(MBB)) @@ -817,20 +808,16 @@ bool AMDGPUCFGStructurizer::run() { ++SccNumIter; int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { - DEBUG( - dbgs() << "Can't reduce SCC " << getSCCNum(MBB) - << ", sccNumIter = " << SccNumIter; - dbgs() << "doesn't make any progress\n"; - ); + LLVM_DEBUG(dbgs() << "Can't reduce SCC " << getSCCNum(MBB) + << ", sccNumIter = " << SccNumIter; + dbgs() << "doesn't make any progress\n";); ContNextScc = true; } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { SccNumBlk = sccRemainedNumBlk; It = SccBeginIter; ContNextScc = false; - DEBUG( - dbgs() << "repeat processing SCC" << getSCCNum(MBB) - << "sccNumIter = " << SccNumIter << '\n'; - ); + LLVM_DEBUG(dbgs() << "repeat processing SCC" << getSCCNum(MBB) + << "sccNumIter = " << SccNumIter << '\n';); } else { // Finish the current scc. ContNextScc = true; @@ -848,9 +835,7 @@ bool AMDGPUCFGStructurizer::run() { *GraphTraits<MachineFunction *>::nodes_begin(FuncRep); if (EntryMBB->succ_size() == 0) { Finish = true; - DEBUG( - dbgs() << "Reduce to one block\n"; - ); + LLVM_DEBUG(dbgs() << "Reduce to one block\n";); } else { int NewnumRemainedBlk = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); @@ -860,9 +845,7 @@ bool AMDGPUCFGStructurizer::run() { NumRemainedBlk = NewnumRemainedBlk; } else { MakeProgress = false; - DEBUG( - dbgs() << "No progress\n"; - ); + LLVM_DEBUG(dbgs() << "No progress\n";); } } } while (!Finish && MakeProgress); @@ -875,9 +858,7 @@ bool AMDGPUCFGStructurizer::run() { It != E; ++It) { if ((*It).second && (*It).second->IsRetired) { assert(((*It).first)->getNumber() != -1); - DEBUG( - dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";); (*It).first->eraseFromParent(); //Remove from the parent Function. 
} delete (*It).second; @@ -886,7 +867,7 @@ bool AMDGPUCFGStructurizer::run() { LLInfoMap.clear(); if (!Finish) { - DEBUG(FuncRep->viewCFG()); + LLVM_DEBUG(FuncRep->viewCFG()); report_fatal_error("IRREDUCIBLE_CFG"); } @@ -920,17 +901,13 @@ int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { int NumMatch = 0; int CurMatch; - DEBUG( - dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";); while ((CurMatch = patternMatchGroup(MBB)) > 0) NumMatch += CurMatch; - DEBUG( - dbgs() << "End patternMatch BB" << MBB->getNumber() - << ", numMatch = " << NumMatch << "\n"; - ); + LLVM_DEBUG(dbgs() << "End patternMatch BB" << MBB->getNumber() + << ", numMatch = " << NumMatch << "\n";); return NumMatch; } @@ -1050,7 +1027,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() { for (MachineLoop *ExaminedLoop : NestedLoops) { if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) continue; - DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); + LLVM_DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); int NumBreak = mergeLoop(ExaminedLoop); if (NumBreak == -1) break; @@ -1064,7 +1041,8 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { MBBVector ExitingMBBs; LoopRep->getExitingBlocks(ExitingMBBs); assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); - DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";); + LLVM_DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() + << " exiting blocks\n";); // We assume a single ExitBlk MBBVector ExitBlks; LoopRep->getExitBlocks(ExitBlks); @@ -1106,11 +1084,9 @@ bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; if (TheEntry) { - DEBUG( - dbgs() << "isLoopContBreakBlock yes src1 = BB" - << Src1MBB->getNumber() - << " src2 = BB" << Src2MBB->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "isLoopContBreakBlock yes src1 = BB" + << Src1MBB->getNumber() << " src2 = BB" + << Src2MBB->getNumber() << "\n";); return true; } } @@ -1122,9 +1098,8 @@ int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); if (Num == 0) { - DEBUG( - dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; - ); + LLVM_DEBUG(dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" + << "\n";); Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); } return Num; @@ -1138,22 +1113,16 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, //trueBlk could be the common post dominator DownBlk = TrueMBB; - DEBUG( - dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() - << " true = BB" << TrueMBB->getNumber() - << ", numSucc=" << TrueMBB->succ_size() - << " false = BB" << FalseMBB->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() + << " true = BB" << TrueMBB->getNumber() + << ", numSucc=" << TrueMBB->succ_size() << " false = BB" + << FalseMBB->getNumber() << "\n";); while (DownBlk) { - DEBUG( - dbgs() << "check down = BB" << DownBlk->getNumber(); - ); + LLVM_DEBUG(dbgs() << "check down = BB" << DownBlk->getNumber();); if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { - DEBUG( - dbgs() << " working\n"; - ); + LLVM_DEBUG(dbgs() << " working\n";); Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); Num += 
cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); @@ -1166,9 +1135,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, break; } - DEBUG( - dbgs() << " not working\n"; - ); + LLVM_DEBUG(dbgs() << " not working\n";); DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr; } // walk down the postDomTree @@ -1247,10 +1214,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) MigrateFalse = true; - DEBUG( - dbgs() << "before improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); - ); + LLVM_DEBUG( + dbgs() << "before improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);); // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk // @@ -1337,15 +1303,15 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); - //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" - MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); + //insert R600::ENDIF to avoid special case "input landBlk == NULL" + MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, R600::ENDIF); if (LandBlkHasOtherPred) { report_fatal_error("Extra register needed to handle CFG"); unsigned CmpResReg = HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); report_fatal_error("Extra compare instruction needed to handle CFG"); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, + insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, CmpResReg, DebugLoc()); } @@ -1353,7 +1319,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // cause an assertion failure in the PostRA scheduling pass. unsigned InitReg = HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, + insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg, DebugLoc()); if (MigrateTrue) { @@ -1363,7 +1329,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // (initVal != 1). 
report_fatal_error("Extra register needed to handle CFG"); } - insertInstrBefore(I, AMDGPU::ELSE); + insertInstrBefore(I, R600::ELSE); if (MigrateFalse) { migrateInstruction(FalseMBB, LandBlk, I); @@ -1375,7 +1341,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, if (LandBlkHasOtherPred) { // add endif - insertInstrBefore(I, AMDGPU::ENDIF); + insertInstrBefore(I, R600::ENDIF); // put initReg = 2 to other predecessors of landBlk for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), @@ -1385,10 +1351,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, report_fatal_error("Extra register needed to handle CFG"); } } - DEBUG( - dbgs() << "result from improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); - ); + LLVM_DEBUG( + dbgs() << "result from improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);); // update landBlk *LandMBBPtr = LandBlk; @@ -1398,10 +1363,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, MachineBasicBlock *SrcMBB) { - DEBUG( - dbgs() << "serialPattern BB" << DstMBB->getNumber() - << " <= BB" << SrcMBB->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB" + << SrcMBB->getNumber() << "\n";); DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); DstMBB->removeSuccessor(SrcMBB, true); @@ -1416,26 +1379,15 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { assert (TrueMBB); - DEBUG( - dbgs() << "ifPattern BB" << MBB->getNumber(); - dbgs() << "{ "; - if (TrueMBB) { - dbgs() << "BB" << TrueMBB->getNumber(); - } - dbgs() << " } else "; - dbgs() << "{ "; - if (FalseMBB) { - dbgs() << "BB" << FalseMBB->getNumber(); - } - dbgs() << " }\n "; - dbgs() << "landBlock: "; - if (!LandMBB) { - dbgs() << "NULL"; - } else { - dbgs() << "BB" << LandMBB->getNumber(); - } - dbgs() << "\n"; - ); + LLVM_DEBUG(dbgs() << "ifPattern BB" << MBB->getNumber(); dbgs() << "{ "; + if (TrueMBB) { dbgs() << "BB" << TrueMBB->getNumber(); } dbgs() + << " } else "; + dbgs() << "{ "; if (FalseMBB) { + dbgs() << "BB" << FalseMBB->getNumber(); + } dbgs() << " }\n "; + dbgs() << "landBlock: "; if (!LandMBB) { dbgs() << "NULL"; } else { + dbgs() << "BB" << LandMBB->getNumber(); + } dbgs() << "\n";); int OldOpcode = BranchMI->getOpcode(); DebugLoc BranchDL = BranchMI->getDebugLoc(); @@ -1462,7 +1414,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, } if (FalseMBB) { - insertInstrBefore(I, AMDGPU::ELSE); + insertInstrBefore(I, R600::ELSE); MBB->splice(I, FalseMBB, FalseMBB->begin(), FalseMBB->end()); MBB->removeSuccessor(FalseMBB, true); @@ -1471,7 +1423,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, retireBlock(FalseMBB); MLI->removeBlock(FalseMBB); } - insertInstrBefore(I, AMDGPU::ENDIF); + insertInstrBefore(I, R600::ENDIF); BranchMI->eraseFromParent(); @@ -1481,18 +1433,19 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, MachineBasicBlock *LandMBB) { - DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() - << " land = BB" << LandMBB->getNumber() << "\n";); + LLVM_DEBUG(dbgs() << 
"loopPattern header = BB" << DstBlk->getNumber() + << " land = BB" << LandMBB->getNumber() << "\n";); - insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); - insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); + insertInstrBefore(DstBlk, R600::WHILELOOP, DebugLoc()); + insertInstrEnd(DstBlk, R600::ENDLOOP, DebugLoc()); DstBlk->replaceSuccessor(DstBlk, LandMBB); } void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, MachineBasicBlock *LandMBB) { - DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() - << " land = BB" << LandMBB->getNumber() << "\n";); + LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB" + << ExitingMBB->getNumber() << " land = BB" + << LandMBB->getNumber() << "\n";); MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); assert(BranchMI && isCondBranch(BranchMI)); DebugLoc DL = BranchMI->getDebugLoc(); @@ -1500,9 +1453,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, MachineBasicBlock::iterator I = BranchMI; if (TrueBranch != LandMBB) reversePredicateSetter(I, *I->getParent()); - insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); - insertInstrBefore(I, AMDGPU::BREAK); - insertInstrBefore(I, AMDGPU::ENDIF); + insertCondBranchBefore(ExitingMBB, I, R600::IF_PREDICATE_SET, R600::PREDICATE_BIT, DL); + insertInstrBefore(I, R600::BREAK); + insertInstrBefore(I, R600::ENDIF); //now branchInst can be erase safely BranchMI->eraseFromParent(); //now take care of successors, retire blocks @@ -1511,9 +1464,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, MachineBasicBlock *ContMBB) { - DEBUG(dbgs() << "settleLoopcontBlock conting = BB" - << ContingMBB->getNumber() - << ", cont = BB" << ContMBB->getNumber() << "\n";); + LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB" + << ContingMBB->getNumber() << ", cont = BB" + << ContMBB->getNumber() << "\n";); MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB); if (MI) { @@ -1531,8 +1484,8 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, getBranchZeroOpcode(OldOpcode); insertCondBranchBefore(I, BranchOpcode, DL); // insertEnd to ensure phi-moves, if exist, go before the continue-instr. - insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL); - insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL); + insertInstrEnd(ContingMBB, R600::CONTINUE, DL); + insertInstrEnd(ContingMBB, R600::ENDIF, DL); } else { int BranchOpcode = TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : @@ -1547,7 +1500,7 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, // location we've just inserted that reference here so it should be // representative insertEnd to ensure phi-moves, if exist, go before the // continue-instr. 
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, + insertInstrEnd(ContingMBB, R600::CONTINUE, getLastDebugLocInBB(ContingMBB)); } } @@ -1587,10 +1540,9 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, numClonedInstr += MBB->size(); - DEBUG( - dbgs() << "Cloned block: " << "BB" - << MBB->getNumber() << "size " << MBB->size() << "\n"; - ); + LLVM_DEBUG(dbgs() << "Cloned block: " + << "BB" << MBB->getNumber() << "size " << MBB->size() + << "\n";); SHOWNEWBLK(CloneMBB, "result of Cloned block: "); @@ -1603,26 +1555,22 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, //look for the input branchinstr, not the AMDGPU branchinstr MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); if (!BranchMI) { - DEBUG( - dbgs() << "migrateInstruction don't see branch instr\n"; - ); + LLVM_DEBUG(dbgs() << "migrateInstruction don't see branch instr\n";); SpliceEnd = SrcMBB->end(); } else { - DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); + LLVM_DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); SpliceEnd = BranchMI; } - DEBUG( - dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; - ); + LLVM_DEBUG(dbgs() << "migrateInstruction before splice dstSize = " + << DstMBB->size() << "srcSize = " << SrcMBB->size() + << "\n";); //splice insert before insertPos DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd); - DEBUG( - dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << '\n'; - ); + LLVM_DEBUG(dbgs() << "migrateInstruction after splice dstSize = " + << DstMBB->size() << "srcSize = " << SrcMBB->size() + << '\n';); } MachineBasicBlock * @@ -1640,7 +1588,7 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); FuncRep->push_back(DummyExitBlk); //insert to function SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); - DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); + LLVM_DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext(); Ctx.emitError("Extra register needed to handle CFG"); return nullptr; @@ -1653,7 +1601,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. 
while ((BranchMI = getLoopendBlockBranchInstr(MBB)) && isUncondBranch(BranchMI)) { - DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); + LLVM_DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); } } @@ -1669,7 +1617,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); assert(BranchMI && isCondBranch(BranchMI)); - DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); + LLVM_DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); SHOWNEWBLK(MBB1, "Removing redundant successor"); MBB->removeSuccessor(MBB1, true); @@ -1679,7 +1627,7 @@ void AMDGPUCFGStructurizer::addDummyExitBlock( SmallVectorImpl<MachineBasicBlock*> &RetMBB) { MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); FuncRep->push_back(DummyExitBlk); //insert to function - insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); + insertInstrEnd(DummyExitBlk, R600::RETURN); for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(), E = RetMBB.end(); It != E; ++It) { @@ -1688,10 +1636,8 @@ void AMDGPUCFGStructurizer::addDummyExitBlock( if (MI) MI->eraseFromParent(); MBB->addSuccessor(DummyExitBlk); - DEBUG( - dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() - << " successors\n"; - ); + LLVM_DEBUG(dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() + << " successors\n";); } SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); } @@ -1710,9 +1656,7 @@ void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, } void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { - DEBUG( - dbgs() << "Retiring BB" << MBB->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";); BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h index 5d243e949fd3..289642aaa2d0 100644 --- a/lib/Target/AMDGPU/AMDKernelCodeT.h +++ b/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -198,7 +198,7 @@ enum amd_code_property_mask_t { AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT }; -/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL +/// The hsa_ext_control_directives_t specifies the values for the HSAIL /// control directives. These control how the finalizer generates code. This /// struct is used both as an argument to hsaFinalizeKernel to specify values for /// the control directives, and is used in HsaKernelCode to record the values of @@ -551,14 +551,8 @@ typedef struct amd_kernel_code_s { int64_t kernel_code_prefetch_byte_offset; uint64_t kernel_code_prefetch_byte_size; - /// Number of bytes of scratch backing memory required for full - /// occupancy of target chip. This takes into account the number of - /// bytes of scratch per work-item, the wavefront size, the maximum - /// number of wavefronts per CU, and the number of CUs. This is an - /// upper limit on scratch. If the grid being dispatched is small it - /// may only need less than this. If the kernel uses no scratch, or - /// the Finalizer has not computed this value, it must be 0. - uint64_t max_scratch_backing_memory_byte_size; + /// Reserved. Must be 0. + uint64_t reserved0; /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and /// COMPUTE_PGM_RSRC2 registers. 
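The hunks above migrate the structurizer's debug printing from the old DEBUG macro to LLVM_DEBUG. A minimal sketch of the conventional setup the renamed macro relies on; the DEBUG_TYPE string and the helper function here are illustrative only, not taken from this change:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

// LLVM_DEBUG expands against whatever DEBUG_TYPE is defined in the
// translation unit at the point of use.
#define DEBUG_TYPE "structurizer-example"

static void traceMerge(int DstNum, int SrcNum) {
  // Compiled out entirely in NDEBUG builds; in +Asserts builds the message is
  // printed only when -debug or -debug-only=structurizer-example is given.
  LLVM_DEBUG(llvm::dbgs() << "serialPattern BB" << DstNum << " <= BB"
                          << SrcNum << "\n");
}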
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ebf656c549ec..31e2885c833d 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -12,6 +12,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIDefines.h" +#include "SIInstrInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" @@ -25,7 +26,6 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -42,9 +42,11 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/AMDGPUMetadata.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/TargetRegistry.h" @@ -60,6 +62,7 @@ using namespace llvm; using namespace llvm::AMDGPU; +using namespace llvm::amdhsa; namespace { @@ -128,6 +131,7 @@ public: enum ImmTy { ImmTyNone, ImmTyGDS, + ImmTyLDS, ImmTyOffen, ImmTyIdxen, ImmTyAddr64, @@ -138,6 +142,7 @@ public: ImmTyGLC, ImmTySLC, ImmTyTFE, + ImmTyD16, ImmTyClampSI, ImmTyOModSI, ImmTyDppCtrl, @@ -267,7 +272,11 @@ public: return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); } - bool isSDWARegKind() const; + bool isSDWAOperand(MVT type) const; + bool isSDWAFP16Operand() const; + bool isSDWAFP32Operand() const; + bool isSDWAInt16Operand() const; + bool isSDWAInt32Operand() const; bool isImmTy(ImmTy ImmT) const { return isImm() && Imm.Type == ImmT; @@ -282,7 +291,7 @@ public: bool isDMask() const { return isImmTy(ImmTyDMask); } bool isUNorm() const { return isImmTy(ImmTyUNorm); } bool isDA() const { return isImmTy(ImmTyDA); } - bool isR128() const { return isImmTy(ImmTyUNorm); } + bool isR128() const { return isImmTy(ImmTyR128); } bool isLWE() const { return isImmTy(ImmTyLWE); } bool isOff() const { return isImmTy(ImmTyOff); } bool isExpTgt() const { return isImmTy(ImmTyExpTgt); } @@ -298,9 +307,11 @@ public: bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); } bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); } bool isGDS() const { return isImmTy(ImmTyGDS); } + bool isLDS() const { return isImmTy(ImmTyLDS); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isD16() const { return isImmTy(ImmTyD16); } bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } @@ -434,7 +445,7 @@ public: } bool isVSrcB32() const { - return isVCSrcF32() || isLiteralImm(MVT::i32); + return isVCSrcF32() || isLiteralImm(MVT::i32) || isExpr(); } bool isVSrcB64() const { @@ -451,7 +462,7 @@ public: } bool isVSrcF32() const { - return isVCSrcF32() || isLiteralImm(MVT::f32); + return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr(); } bool isVSrcF64() const { @@ -643,6 +654,7 @@ public: switch (Type) { case 
ImmTyNone: OS << "None"; break; case ImmTyGDS: OS << "GDS"; break; + case ImmTyLDS: OS << "LDS"; break; case ImmTyOffen: OS << "Offen"; break; case ImmTyIdxen: OS << "Idxen"; break; case ImmTyAddr64: OS << "Addr64"; break; @@ -653,6 +665,7 @@ public: case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyD16: OS << "D16"; break; case ImmTyDFMT: OS << "DFMT"; break; case ImmTyNFMT: OS << "NFMT"; break; case ImmTyClampSI: OS << "ClampSI"; break; @@ -815,6 +828,10 @@ public: class AMDGPUAsmParser : public MCTargetAsmParser { MCAsmParser &Parser; + // Number of extra operands parsed after the first optional operand. + // This may be necessary to skip hardcoded mandatory operands. + static const unsigned MAX_OPR_LOOKAHEAD = 8; + unsigned ForcedEncodingSize = 0; bool ForcedDPP = false; bool ForcedSDWA = false; @@ -830,6 +847,27 @@ class AMDGPUAsmParser : public MCTargetAsmParser { private: bool ParseAsAbsoluteExpression(uint32_t &Ret); + bool OutOfRangeError(SMRange Range); + /// Calculate VGPR/SGPR blocks required for given target, reserved + /// registers, and user-specified NextFreeXGPR values. + /// + /// \param Features [in] Target features, used for bug corrections. + /// \param VCCUsed [in] Whether VCC special SGPR is reserved. + /// \param FlatScrUsed [in] Whether FLAT_SCRATCH special SGPR is reserved. + /// \param XNACKUsed [in] Whether XNACK_MASK special SGPR is reserved. + /// \param NextFreeVGPR [in] Max VGPR number referenced, plus one. + /// \param VGPRRange [in] Token range, used for VGPR diagnostics. + /// \param NextFreeSGPR [in] Max SGPR number referenced, plus one. + /// \param SGPRRange [in] Token range, used for SGPR diagnostics. + /// \param VGPRBlocks [out] Result VGPR block count. + /// \param SGPRBlocks [out] Result SGPR block count. 
+ bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed, + bool FlatScrUsed, bool XNACKUsed, + unsigned NextFreeVGPR, SMRange VGPRRange, + unsigned NextFreeSGPR, SMRange SGPRRange, + unsigned &VGPRBlocks, unsigned &SGPRBlocks); + bool ParseDirectiveAMDGCNTarget(); + bool ParseDirectiveAMDHSAKernel(); bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); bool ParseDirectiveHSACodeObjectVersion(); bool ParseDirectiveHSACodeObjectISA(); @@ -848,8 +886,12 @@ private: bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex); + Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind); + void initializeGprCountSymbol(RegisterKind RegKind); + bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex, + unsigned RegWidth); void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, - bool IsAtomic, bool IsAtomicReturn); + bool IsAtomic, bool IsAtomicReturn, bool IsLds = false); void cvtDSImpl(MCInst &Inst, const OperandVector &Operands, bool IsGdsHardcoded); @@ -881,15 +923,37 @@ public: AMDGPU::IsaInfo::IsaVersion ISA = AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); MCContext &Ctx = getContext(); - MCSymbol *Sym = - Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); + if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { + MCSymbol *Sym = + Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); + } else { + MCSymbol *Sym = + Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); + } + if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { + initializeGprCountSymbol(IS_VGPR); + initializeGprCountSymbol(IS_SGPR); + } else + KernelScope.initialize(getContext()); } - KernelScope.initialize(getContext()); + } + + bool hasXNACK() const { + return AMDGPU::hasXNACK(getSTI()); + } + + bool hasMIMG_R128() const { + return AMDGPU::hasMIMG_R128(getSTI()); + } + + bool hasPackedD16() const { + return AMDGPU::hasPackedD16(getSTI()); } bool isSI() const { @@ -1025,6 +1089,11 @@ private: bool validateConstantBusLimitations(const MCInst &Inst); bool validateEarlyClobberLimitations(const MCInst &Inst); bool validateIntClampSupported(const MCInst &Inst); + bool validateMIMGAtomicDMask(const MCInst &Inst); + bool validateMIMGGatherDMask(const MCInst &Inst); + bool validateMIMGDataSize(const MCInst &Inst); + bool validateMIMGR128(const MCInst &Inst); + bool validateMIMGD16(const MCInst &Inst); bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; @@ -1037,6 +1106,7 @@ private: public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); + OperandMatchResultTy 
parseOptionalOpr(OperandVector &Operands); OperandMatchResultTy parseExpTgt(OperandVector &Operands); OperandMatchResultTy parseSendMsgOp(OperandVector &Operands); @@ -1060,17 +1130,12 @@ public: void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); } void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; - AMDGPUOperand::Ptr defaultTFE() const; - AMDGPUOperand::Ptr defaultDMask() const; - AMDGPUOperand::Ptr defaultUNorm() const; - AMDGPUOperand::Ptr defaultDA() const; - AMDGPUOperand::Ptr defaultR128() const; - AMDGPUOperand::Ptr defaultLWE() const; AMDGPUOperand::Ptr defaultSMRDOffset8() const; AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; @@ -1276,15 +1341,31 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } -bool AMDGPUOperand::isSDWARegKind() const { +bool AMDGPUOperand::isSDWAOperand(MVT type) const { if (AsmParser->isVI()) return isVReg(); else if (AsmParser->isGFX9()) - return isRegKind(); + return isRegKind() || isInlinableImm(type); else return false; } +bool AMDGPUOperand::isSDWAFP16Operand() const { + return isSDWAOperand(MVT::f16); +} + +bool AMDGPUOperand::isSDWAFP32Operand() const { + return isSDWAOperand(MVT::f32); +} + +bool AMDGPUOperand::isSDWAInt16Operand() const { + return isSDWAOperand(MVT::i16); +} + +bool AMDGPUOperand::isSDWAInt32Operand() const { + return isSDWAOperand(MVT::i32); +} + uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const { assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); @@ -1516,12 +1597,15 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Case("exec", AMDGPU::EXEC) .Case("vcc", AMDGPU::VCC) .Case("flat_scratch", AMDGPU::FLAT_SCR) + .Case("xnack_mask", AMDGPU::XNACK_MASK) .Case("m0", AMDGPU::M0) .Case("scc", AMDGPU::SCC) .Case("tba", AMDGPU::TBA) .Case("tma", AMDGPU::TMA) .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) + .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO) + .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI) .Case("vcc_lo", AMDGPU::VCC_LO) .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -1559,6 +1643,11 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, RegWidth = 2; return true; } + if (Reg == AMDGPU::XNACK_MASK_LO && Reg1 == AMDGPU::XNACK_MASK_HI) { + Reg = AMDGPU::XNACK_MASK; + RegWidth = 2; + return true; + } if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; @@ -1717,6 +1806,54 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, return true; } +Optional<StringRef> +AMDGPUAsmParser::getGprCountSymbolName(RegisterKind RegKind) { + switch (RegKind) { + case IS_VGPR: + return StringRef(".amdgcn.next_free_vgpr"); + case IS_SGPR: + return StringRef(".amdgcn.next_free_sgpr"); + default: + return None; + } +} + +void AMDGPUAsmParser::initializeGprCountSymbol(RegisterKind RegKind) { + auto SymbolName = 
getGprCountSymbolName(RegKind); + assert(SymbolName && "initializing invalid register kind"); + MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName); + Sym->setVariableValue(MCConstantExpr::create(0, getContext())); +} + +bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind, + unsigned DwordRegIndex, + unsigned RegWidth) { + // Symbols are only defined for GCN targets + if (AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()).Major < 6) + return true; + + auto SymbolName = getGprCountSymbolName(RegKind); + if (!SymbolName) + return true; + MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName); + + int64_t NewMax = DwordRegIndex + RegWidth - 1; + int64_t OldCount; + + if (!Sym->isVariable()) + return !Error(getParser().getTok().getLoc(), + ".amdgcn.next_free_{v,s}gpr symbols must be variable"); + if (!Sym->getVariableValue(false)->evaluateAsAbsolute(OldCount)) + return !Error( + getParser().getTok().getLoc(), + ".amdgcn.next_free_{v,s}gpr symbols must be absolute expressions"); + + if (OldCount <= NewMax) + Sym->setVariableValue(MCConstantExpr::create(NewMax + 1, getContext())); + + return true; +} + std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { const auto &Tok = Parser.getTok(); SMLoc StartLoc = Tok.getLoc(); @@ -1727,7 +1864,11 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) { return nullptr; } - KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth); + if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { + if (!updateGprCountSymbols(RegKind, DwordRegIndex, RegWidth)) + return nullptr; + } else + KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth); return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false); } @@ -2234,6 +2375,111 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) { return true; } +bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + + int VDataIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask); + int TFEIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::tfe); + + assert(VDataIdx != -1); + assert(DMaskIdx != -1); + assert(TFEIdx != -1); + + unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx); + unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0; + unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; + if (DMask == 0) + DMask = 1; + + unsigned DataSize = + (Desc.TSFlags & SIInstrFlags::Gather4) ? 
4 : countPopulation(DMask); + if (hasPackedD16()) { + int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16); + if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm()) + DataSize = (DataSize + 1) / 2; + } + + return (VDataSize / 4) == DataSize + TFESize; +} + +bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + if (!Desc.mayLoad() || !Desc.mayStore()) + return true; // Not atomic + + int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask); + unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; + + // This is an incomplete check because image_atomic_cmpswap + // may only use 0x3 and 0xf while other atomic operations + // may use 0x1 and 0x3. However these limitations are + // verified when we check that dmask matches dst size. + return DMask == 0x1 || DMask == 0x3 || DMask == 0xf; +} + +bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::Gather4) == 0) + return true; + + int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask); + unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; + + // GATHER4 instructions use dmask in a different fashion compared to + // other MIMG instructions. The only useful DMASK values are + // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns + // (red,red,red,red) etc.) The ISA document doesn't mention + // this. + return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8; +} + +bool AMDGPUAsmParser::validateMIMGR128(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + + int Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128); + assert(Idx != -1); + + bool R128 = (Inst.getOperand(Idx).getImm() != 0); + + return !R128 || hasMIMG_R128(); +} + +bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + + int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16); + if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm()) { + if (isCI() || isSI()) + return false; + } + + return true; +} + bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc) { if (!validateConstantBusLimitations(Inst)) { @@ -2251,6 +2497,32 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "integer clamping is not supported on this GPU"); return false; } + if (!validateMIMGR128(Inst)) { + Error(IDLoc, + "r128 modifier is not supported on this GPU"); + return false; + } + // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate. 
+ if (!validateMIMGD16(Inst)) { + Error(IDLoc, + "d16 modifier is not supported on this GPU"); + return false; + } + if (!validateMIMGDataSize(Inst)) { + Error(IDLoc, + "image data size does not match dmask and tfe"); + return false; + } + if (!validateMIMGAtomicDMask(Inst)) { + Error(IDLoc, + "invalid atomic image dmask"); + return false; + } + if (!validateMIMGGatherDMask(Inst)) { + Error(IDLoc, + "invalid image_gather dmask: only one bit must be set"); + return false; + } return true; } @@ -2355,6 +2627,320 @@ bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major, return false; } +bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() { + if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) + return TokError("directive only supported for amdgcn architecture"); + + std::string Target; + + SMLoc TargetStart = getTok().getLoc(); + if (getParser().parseEscapedString(Target)) + return true; + SMRange TargetRange = SMRange(TargetStart, getTok().getLoc()); + + std::string ExpectedTarget; + raw_string_ostream ExpectedTargetOS(ExpectedTarget); + IsaInfo::streamIsaVersion(&getSTI(), ExpectedTargetOS); + + if (Target != ExpectedTargetOS.str()) + return getParser().Error(TargetRange.Start, "target must match options", + TargetRange); + + getTargetStreamer().EmitDirectiveAMDGCNTarget(Target); + return false; +} + +bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) { + return getParser().Error(Range.Start, "value out of range", Range); +} + +bool AMDGPUAsmParser::calculateGPRBlocks( + const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed, + bool XNACKUsed, unsigned NextFreeVGPR, SMRange VGPRRange, + unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks, + unsigned &SGPRBlocks) { + // TODO(scott.linder): These calculations are duplicated from + // AMDGPUAsmPrinter::getSIProgramInfo and could be unified. 
+ IsaInfo::IsaVersion Version = IsaInfo::getIsaVersion(Features); + + unsigned NumVGPRs = NextFreeVGPR; + unsigned NumSGPRs = NextFreeSGPR; + unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(Features); + + if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) && + NumSGPRs > MaxAddressableNumSGPRs) + return OutOfRangeError(SGPRRange); + + NumSGPRs += + IsaInfo::getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed, XNACKUsed); + + if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && + NumSGPRs > MaxAddressableNumSGPRs) + return OutOfRangeError(SGPRRange); + + if (Features.test(FeatureSGPRInitBug)) + NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + + VGPRBlocks = IsaInfo::getNumVGPRBlocks(Features, NumVGPRs); + SGPRBlocks = IsaInfo::getNumSGPRBlocks(Features, NumSGPRs); + + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { + if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) + return TokError("directive only supported for amdgcn architecture"); + + if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) + return TokError("directive only supported for amdhsa OS"); + + StringRef KernelName; + if (getParser().parseIdentifier(KernelName)) + return true; + + kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(); + + StringSet<> Seen; + + IsaInfo::IsaVersion IVersion = + IsaInfo::getIsaVersion(getSTI().getFeatureBits()); + + SMRange VGPRRange; + uint64_t NextFreeVGPR = 0; + SMRange SGPRRange; + uint64_t NextFreeSGPR = 0; + unsigned UserSGPRCount = 0; + bool ReserveVCC = true; + bool ReserveFlatScr = true; + bool ReserveXNACK = hasXNACK(); + + while (true) { + while (getLexer().is(AsmToken::EndOfStatement)) + Lex(); + + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected .amdhsa_ directive or .end_amdhsa_kernel"); + + StringRef ID = getTok().getIdentifier(); + SMRange IDRange = getTok().getLocRange(); + Lex(); + + if (ID == ".end_amdhsa_kernel") + break; + + if (Seen.find(ID) != Seen.end()) + return TokError(".amdhsa_ directives cannot be repeated"); + Seen.insert(ID); + + SMLoc ValStart = getTok().getLoc(); + int64_t IVal; + if (getParser().parseAbsoluteExpression(IVal)) + return true; + SMLoc ValEnd = getTok().getLoc(); + SMRange ValRange = SMRange(ValStart, ValEnd); + + if (IVal < 0) + return OutOfRangeError(ValRange); + + uint64_t Val = IVal; + +#define PARSE_BITS_ENTRY(FIELD, ENTRY, VALUE, RANGE) \ + if (!isUInt<ENTRY##_WIDTH>(VALUE)) \ + return OutOfRangeError(RANGE); \ + AMDHSA_BITS_SET(FIELD, ENTRY, VALUE); + + if (ID == ".amdhsa_group_segment_fixed_size") { + if (!isUInt<sizeof(KD.group_segment_fixed_size) * CHAR_BIT>(Val)) + return OutOfRangeError(ValRange); + KD.group_segment_fixed_size = Val; + } else if (ID == ".amdhsa_private_segment_fixed_size") { + if (!isUInt<sizeof(KD.private_segment_fixed_size) * CHAR_BIT>(Val)) + return OutOfRangeError(ValRange); + KD.private_segment_fixed_size = Val; + } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, + Val, ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, + ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_user_sgpr_queue_ptr") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val, + ValRange); + UserSGPRCount++; + } else if (ID == 
".amdhsa_user_sgpr_kernarg_segment_ptr") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, + Val, ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_user_sgpr_dispatch_id") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val, + ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, + ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_user_sgpr_private_segment_size") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, + Val, ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") { + PARSE_BITS_ENTRY( + KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET, Val, + ValRange); + } else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val, + ValRange); + } else if (ID == ".amdhsa_system_sgpr_workgroup_id_y") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, Val, + ValRange); + } else if (ID == ".amdhsa_system_sgpr_workgroup_id_z") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z, Val, + ValRange); + } else if (ID == ".amdhsa_system_sgpr_workgroup_info") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO, Val, + ValRange); + } else if (ID == ".amdhsa_system_vgpr_workitem_id") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, Val, + ValRange); + } else if (ID == ".amdhsa_next_free_vgpr") { + VGPRRange = ValRange; + NextFreeVGPR = Val; + } else if (ID == ".amdhsa_next_free_sgpr") { + SGPRRange = ValRange; + NextFreeSGPR = Val; + } else if (ID == ".amdhsa_reserve_vcc") { + if (!isUInt<1>(Val)) + return OutOfRangeError(ValRange); + ReserveVCC = Val; + } else if (ID == ".amdhsa_reserve_flat_scratch") { + if (IVersion.Major < 7) + return getParser().Error(IDRange.Start, "directive requires gfx7+", + IDRange); + if (!isUInt<1>(Val)) + return OutOfRangeError(ValRange); + ReserveFlatScr = Val; + } else if (ID == ".amdhsa_reserve_xnack_mask") { + if (IVersion.Major < 8) + return getParser().Error(IDRange.Start, "directive requires gfx8+", + IDRange); + if (!isUInt<1>(Val)) + return OutOfRangeError(ValRange); + ReserveXNACK = Val; + } else if (ID == ".amdhsa_float_round_mode_32") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange); + } else if (ID == ".amdhsa_float_round_mode_16_64") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64, Val, ValRange); + } else if (ID == ".amdhsa_float_denorm_mode_32") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32, Val, ValRange); + } else if (ID == ".amdhsa_float_denorm_mode_16_64") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Val, + ValRange); + } else if (ID == ".amdhsa_dx10_clamp") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, Val, ValRange); + } else if (ID == ".amdhsa_ieee_mode") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, + Val, ValRange); + } else if (ID == ".amdhsa_fp16_overflow") { + if (IVersion.Major < 9) 
+ return getParser().Error(IDRange.Start, "directive requires gfx9+", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val, + ValRange); + } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") { + PARSE_BITS_ENTRY( + KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, Val, + ValRange); + } else if (ID == ".amdhsa_exception_fp_denorm_src") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, + Val, ValRange); + } else if (ID == ".amdhsa_exception_fp_ieee_div_zero") { + PARSE_BITS_ENTRY( + KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, Val, + ValRange); + } else if (ID == ".amdhsa_exception_fp_ieee_overflow") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, + Val, ValRange); + } else if (ID == ".amdhsa_exception_fp_ieee_underflow") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, + Val, ValRange); + } else if (ID == ".amdhsa_exception_fp_ieee_inexact") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, + Val, ValRange); + } else if (ID == ".amdhsa_exception_int_div_zero") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO, + Val, ValRange); + } else { + return getParser().Error(IDRange.Start, + "unknown .amdhsa_kernel directive", IDRange); + } + +#undef PARSE_BITS_ENTRY + } + + if (Seen.find(".amdhsa_next_free_vgpr") == Seen.end()) + return TokError(".amdhsa_next_free_vgpr directive is required"); + + if (Seen.find(".amdhsa_next_free_sgpr") == Seen.end()) + return TokError(".amdhsa_next_free_sgpr directive is required"); + + unsigned VGPRBlocks; + unsigned SGPRBlocks; + if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr, + ReserveXNACK, NextFreeVGPR, VGPRRange, NextFreeSGPR, + SGPRRange, VGPRBlocks, SGPRBlocks)) + return true; + + if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>( + VGPRBlocks)) + return OutOfRangeError(VGPRRange); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, VGPRBlocks); + + if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>( + SGPRBlocks)) + return OutOfRangeError(SGPRRange); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, + SGPRBlocks); + + if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount)) + return TokError("too many user SGPRs enabled"); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, + UserSGPRCount); + + getTargetStreamer().EmitAmdhsaKernelDescriptor( + getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC, + ReserveFlatScr, ReserveXNACK); + return false; +} + bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() { uint32_t Major; uint32_t Minor; @@ -2421,6 +3007,13 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header) { + // max_scratch_backing_memory_byte_size is deprecated. Ignore it while parsing + // assembly for backwards compatibility. 
+ if (ID == "max_scratch_backing_memory_byte_size") { + Parser.eatToEndOfStatement(); + return false; + } + SmallString<40> ErrStr; raw_svector_ostream Err(ErrStr); if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) { @@ -2467,7 +3060,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { getTargetStreamer().EmitAMDGPUSymbolType(KernelName, ELF::STT_AMDGPU_HSA_KERNEL); Lex(); - KernelScope.initialize(getContext()); + if (!AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) + KernelScope.initialize(getContext()); return false; } @@ -2571,20 +3165,28 @@ bool AMDGPUAsmParser::ParseDirectivePALMetadata() { bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); - if (IDVal == ".hsa_code_object_version") - return ParseDirectiveHSACodeObjectVersion(); + if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { + if (IDVal == ".amdgcn_target") + return ParseDirectiveAMDGCNTarget(); + + if (IDVal == ".amdhsa_kernel") + return ParseDirectiveAMDHSAKernel(); + } else { + if (IDVal == ".hsa_code_object_version") + return ParseDirectiveHSACodeObjectVersion(); - if (IDVal == ".hsa_code_object_isa") - return ParseDirectiveHSACodeObjectISA(); + if (IDVal == ".hsa_code_object_isa") + return ParseDirectiveHSACodeObjectISA(); - if (IDVal == ".amd_kernel_code_t") - return ParseDirectiveAMDKernelCodeT(); + if (IDVal == ".amd_kernel_code_t") + return ParseDirectiveAMDKernelCodeT(); - if (IDVal == ".amdgpu_hsa_kernel") - return ParseDirectiveAMDGPUHsaKernel(); + if (IDVal == ".amdgpu_hsa_kernel") + return ParseDirectiveAMDGPUHsaKernel(); - if (IDVal == ".amd_amdgpu_isa") - return ParseDirectiveISAVersion(); + if (IDVal == ".amd_amdgpu_isa") + return ParseDirectiveISAVersion(); + } if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin) return ParseDirectiveHSAMetadata(); @@ -2612,6 +3214,10 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, case AMDGPU::TMA_LO: case AMDGPU::TMA_HI: return !isGFX9(); + case AMDGPU::XNACK_MASK: + case AMDGPU::XNACK_MASK_LO: + case AMDGPU::XNACK_MASK_HI: + return !isCI() && !isSI() && hasXNACK(); default: break; } @@ -3158,7 +3764,10 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, HwReg.IsSymbolic = true; HwReg.Id = ID_UNKNOWN_; const StringRef tok = Parser.getTok().getString(); - for (int i = ID_SYMBOLIC_FIRST_; i < ID_SYMBOLIC_LAST_; ++i) { + int Last = ID_SYMBOLIC_LAST_; + if (isSI() || isCI() || isVI()) + Last = ID_SYMBOLIC_FIRST_GFX9_; + for (int i = ID_SYMBOLIC_FIRST_; i < Last; ++i) { if (tok == IdSymbolic[i]) { HwReg.Id = i; break; @@ -3859,7 +4468,7 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) { } else { // Swizzle "offset" operand is optional. // If it is omitted, try parsing other optional operands. - return parseOptionalOperand(Operands); + return parseOptionalOpr(Operands); } } @@ -3907,13 +4516,13 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC); } -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyTFE); -} - void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, - bool IsAtomic, bool IsAtomicReturn) { + bool IsAtomic, + bool IsAtomicReturn, + bool IsLds) { + bool IsLdsOpcode = IsLds; + bool HasLdsModifier = false; OptionalImmIndexMap OptionalIdx; assert(IsAtomicReturn ? 
IsAtomic : true); @@ -3932,6 +4541,8 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, continue; } + HasLdsModifier = Op.isLDS(); + // Handle tokens like 'offen' which are sometimes hard-coded into the // asm string. There are no MCInst operands for these. if (Op.isToken()) { @@ -3943,6 +4554,21 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, OptionalIdx[Op.getImmTy()] = i; } + // This is a workaround for an llvm quirk which may result in an + // incorrect instruction selection. Lds and non-lds versions of + // MUBUF instructions are identical except that lds versions + // have mandatory 'lds' modifier. However this modifier follows + // optional modifiers and llvm asm matcher regards this 'lds' + // modifier as an optional one. As a result, an lds version + // of opcode may be selected even if it has no 'lds' modifier. + if (IsLdsOpcode && !HasLdsModifier) { + int NoLdsOpcode = AMDGPU::getMUBUFNoLdsInst(Inst.getOpcode()); + if (NoLdsOpcode != -1) { // Got lds version - correct it. + Inst.setOpcode(NoLdsOpcode); + IsLdsOpcode = false; + } + } + // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns. if (IsAtomicReturn) { MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning. @@ -3954,7 +4580,10 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + + if (!IsLdsOpcode) { // tfe is not legal with lds opcodes + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + } } void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { @@ -4009,7 +4638,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, if (IsAtomic) { // Add src, same as dst - ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1); + assert(Desc.getNumDefs() == 1); + ((AMDGPUOperand &)*Operands[I - 1]).addRegOperands(Inst, 1); } OptionalImmIndexMap OptionalIdx; @@ -4018,9 +4648,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if (Op.isRegOrImm()) { - Op.addRegOrImmOperands(Inst, 1); - continue; + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); } else if (Op.isImmModifier()) { OptionalIdx[Op.getImmTy()] = I; } else { @@ -4031,37 +4660,18 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16); } void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMIMG(Inst, Operands, true); } 
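The cvtMubufImpl comment above explains why the matcher may pick an lds MUBUF opcode even though no 'lds' modifier was parsed, and how the converter then falls back to the non-lds twin. A self-contained sketch of that correction step, using a hypothetical two-entry opcode table rather than the real AMDGPU::getMUBUFNoLdsInst mapping:

#include <map>

// Hypothetical opcodes standing in for an lds/non-lds MUBUF pair.
enum HypotheticalOpc { BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_LDS_OFFSET };

// Returns the non-lds twin of an lds opcode, or -1 if there is none.
static int noLdsTwin(unsigned Opc) {
  static const std::map<unsigned, unsigned> Twin = {
      {BUFFER_LOAD_DWORD_LDS_OFFSET, BUFFER_LOAD_DWORD_OFFSET}};
  auto It = Twin.find(Opc);
  return It == Twin.end() ? -1 : static_cast<int>(It->second);
}

// Mirrors the workaround: if an lds opcode was matched but the 'lds' modifier
// never appeared among the parsed operands, switch to the non-lds opcode.
static unsigned correctMubufOpcode(unsigned MatchedOpc, bool SawLdsModifier,
                                   bool IsLdsOpcode) {
  if (IsLdsOpcode && !SawLdsModifier) {
    int NoLds = noLdsTwin(MatchedOpc);
    if (NoLds != -1)
      return static_cast<unsigned>(NoLds);
  }
  return MatchedOpc;
}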
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyUNorm); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDA); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyR128); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE); -} - //===----------------------------------------------------------------------===// // smrd //===----------------------------------------------------------------------===// @@ -4148,6 +4758,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr}, {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, + {"lds", AMDGPUOperand::ImmTyLDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr}, {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, @@ -4155,6 +4766,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, + {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"high", AMDGPUOperand::ImmTyHigh, true, nullptr}, {"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr}, {"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul}, @@ -4162,6 +4774,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"da", AMDGPUOperand::ImmTyDA, true, nullptr}, {"r128", AMDGPUOperand::ImmTyR128, true, nullptr}, {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr}, + {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, @@ -4179,6 +4792,39 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { + unsigned size = Operands.size(); + assert(size > 0); + + OperandMatchResultTy res = parseOptionalOpr(Operands); + + // This is a hack to enable hardcoded mandatory operands which follow + // optional operands. + // + // Current design assumes that all operands after the first optional operand + // are also optional. However implementation of some instructions violates + // this rule (see e.g. flat/global atomic which have hardcoded 'glc' operands). + // + // To alleviate this problem, we have to (implicitly) parse extra operands + // to make sure autogenerated parser of custom operands never hit hardcoded + // mandatory operands. + + if (size == 1 || ((AMDGPUOperand &)*Operands[size - 1]).isRegKind()) { + + // We have parsed the first optional operand. + // Parse as many operands as necessary to skip all mandatory operands. 
+ + for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) { + if (res != MatchOperand_Success || + getLexer().is(AsmToken::EndOfStatement)) break; + if (getLexer().is(AsmToken::Comma)) Parser.Lex(); + res = parseOptionalOpr(Operands); + } + } + + return res; +} + +OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) { OperandMatchResultTy res; for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) { // try to parse any optional operand here @@ -4341,12 +4987,14 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); } - // special case v_mac_{f16, f32}: + // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906): // it has src2 register operand that is tied to dst operand // we don't allow modifiers for this operand in assembler so src2_modifiers - // should be 0 - if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi || - Opc == AMDGPU::V_MAC_F16_e64_vi) { + // should be 0. + if (Opc == AMDGPU::V_MAC_F32_e64_si || + Opc == AMDGPU::V_MAC_F32_e64_vi || + Opc == AMDGPU::V_MAC_F16_e64_vi || + Opc == AMDGPU::V_FMAC_F32_e64_vi) { auto it = Inst.begin(); std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 @@ -4448,21 +5096,23 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, //===----------------------------------------------------------------------===// bool AMDGPUOperand::isDPPCtrl() const { + using namespace AMDGPU::DPP; + bool result = isImm() && getImmTy() == ImmTyDppCtrl && isUInt<9>(getImm()); if (result) { int64_t Imm = getImm(); - return ((Imm >= 0x000) && (Imm <= 0x0ff)) || - ((Imm >= 0x101) && (Imm <= 0x10f)) || - ((Imm >= 0x111) && (Imm <= 0x11f)) || - ((Imm >= 0x121) && (Imm <= 0x12f)) || - (Imm == 0x130) || - (Imm == 0x134) || - (Imm == 0x138) || - (Imm == 0x13c) || - (Imm == 0x140) || - (Imm == 0x141) || - (Imm == 0x142) || - (Imm == 0x143); + return (Imm >= DppCtrl::QUAD_PERM_FIRST && Imm <= DppCtrl::QUAD_PERM_LAST) || + (Imm >= DppCtrl::ROW_SHL_FIRST && Imm <= DppCtrl::ROW_SHL_LAST) || + (Imm >= DppCtrl::ROW_SHR_FIRST && Imm <= DppCtrl::ROW_SHR_LAST) || + (Imm >= DppCtrl::ROW_ROR_FIRST && Imm <= DppCtrl::ROW_ROR_LAST) || + (Imm == DppCtrl::WAVE_SHL1) || + (Imm == DppCtrl::WAVE_ROL1) || + (Imm == DppCtrl::WAVE_SHR1) || + (Imm == DppCtrl::WAVE_ROR1) || + (Imm == DppCtrl::ROW_MIRROR) || + (Imm == DppCtrl::ROW_HALF_MIRROR) || + (Imm == DppCtrl::BCAST15) || + (Imm == DppCtrl::BCAST31); } return false; } @@ -4481,6 +5131,8 @@ bool AMDGPUOperand::isU16Imm() const { OperandMatchResultTy AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { + using namespace AMDGPU::DPP; + SMLoc S = Parser.getTok().getLoc(); StringRef Prefix; int64_t Int; @@ -4492,10 +5144,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { } if (Prefix == "row_mirror") { - Int = 0x140; + Int = DppCtrl::ROW_MIRROR; Parser.Lex(); } else if (Prefix == "row_half_mirror") { - Int = 0x141; + Int = DppCtrl::ROW_HALF_MIRROR; Parser.Lex(); } else { // Check to prevent parseDPPCtrlOps from eating invalid tokens @@ -4547,24 +5199,24 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { return MatchOperand_ParseFail; if (Prefix == "row_shl" && 1 <= Int && Int <= 15) { - Int |= 0x100; + Int |= DppCtrl::ROW_SHL0; } else if (Prefix == "row_shr" && 1 <= Int && Int <= 15) { - Int |= 0x110; + Int |= DppCtrl::ROW_SHR0; } else if (Prefix == "row_ror" && 1 <= Int && Int <= 15) { 
- Int |= 0x120; + Int |= DppCtrl::ROW_ROR0; } else if (Prefix == "wave_shl" && 1 == Int) { - Int = 0x130; + Int = DppCtrl::WAVE_SHL1; } else if (Prefix == "wave_rol" && 1 == Int) { - Int = 0x134; + Int = DppCtrl::WAVE_ROL1; } else if (Prefix == "wave_shr" && 1 == Int) { - Int = 0x138; + Int = DppCtrl::WAVE_SHR1; } else if (Prefix == "wave_ror" && 1 == Int) { - Int = 0x13C; + Int = DppCtrl::WAVE_ROR1; } else if (Prefix == "row_bcast") { if (Int == 15) { - Int = 0x142; + Int = DppCtrl::BCAST15; } else if (Int == 31) { - Int = 0x143; + Int = DppCtrl::BCAST31; } else { return MatchOperand_ParseFail; } @@ -4742,7 +5394,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegWithInputModsOperands(Inst, 2); + Op.addRegOrImmWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments OptionalIdx[Op.getImmTy()] = I; @@ -4824,6 +5476,8 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isAddr64() ? Match_Success : Match_InvalidOperand; case MCK_gds: return Operand.isGDS() ? Match_Success : Match_InvalidOperand; + case MCK_lds: + return Operand.isLDS() ? Match_Success : Match_InvalidOperand; case MCK_glc: return Operand.isGLC() ? Match_Success : Match_InvalidOperand; case MCK_idxen: diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 2230457b3a9b..b87c47a6b9ee 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -52,14 +52,19 @@ class getAddrName<int addrKind> { ""))))); } -class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { +class MUBUFAddr64Table <bit is_addr64, string Name> { bit IsAddr64 = is_addr64; - string OpName = NAME # suffix; + string OpName = Name; } -class MTBUFAddr64Table <bit is_addr64, string suffix = ""> { +class MUBUFLdsTable <bit is_lds, string Name> { + bit IsLds = is_lds; + string OpName = Name; +} + +class MTBUFAddr64Table <bit is_addr64, string Name> { bit IsAddr64 = is_addr64; - string OpName = NAME # suffix; + string OpName = Name; } //===----------------------------------------------------------------------===// @@ -137,17 +142,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe), + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe) + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, - slc:$slc, tfe:$tfe), + SLC:$slc, TFE:$tfe), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, - slc:$slc, tfe:$tfe) + SLC:$slc, TFE:$tfe) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -214,13 +219,13 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, - MTBUFAddr64Table<0>; + MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Load_Pseudo <opName, 
BUFAddrKind.Addr64, vdataClass, [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, - MTBUFAddr64Table<1>; + MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; @@ -260,13 +265,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe))]>, - MTBUFAddr64Table<0>; + MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe))]>, - MTBUFAddr64Table<1>; + MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; @@ -310,6 +315,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, bits<1> offen = 0; bits<1> idxen = 0; bits<1> addr64 = 0; + bits<1> lds = 0; bits<1> has_vdata = 1; bits<1> has_vaddr = 1; bits<1> has_glc = 1; @@ -336,7 +342,6 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> : bits<12> offset; bits<1> glc; - bits<1> lds = 0; bits<8> vaddr; bits<8> vdata; bits<7> srsrc; @@ -371,31 +376,35 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node> : } class getMUBUFInsDA<list<RegisterClass> vdataList, - list<RegisterClass> vaddrList=[]> { + list<RegisterClass> vaddrList=[], + bit isLds = 0> { RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe), + offset:$offset, GLC:$glc, SLC:$slc), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe) + offset:$offset, GLC:$glc, SLC:$slc) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe), + SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe) + SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc) ); - dag ret = !if(!empty(vdataList), InsNoData, InsData); + dag ret = !con( + !if(!empty(vdataList), InsNoData, InsData), + !if(isLds, (ins), (ins TFE:$tfe)) + ); } -class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[]> { +class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> { dag ret = - !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList>.ret, - !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret, - !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64]>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64]>.ret, + !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), 
getMUBUFInsDA<vdataList, [VReg_64], isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isLds>.ret, (ins)))))); } @@ -426,20 +435,29 @@ class MUBUF_Load_Pseudo <string opName, int addrKind, RegisterClass vdataClass, bit HasTiedDest = 0, + bit isLds = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MUBUF_Pseudo<opName, (outs vdataClass:$vdata), - !con(getMUBUFIns<addrKindCopy>.ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), - " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + !con(getMUBUFIns<addrKindCopy, [], isLds>.ret, + !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" # + !if(isLds, " lds", "$tfe"), pattern>, MUBUF_SetupAddr<addrKindCopy> { - let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; + let PseudoInstr = opName # !if(isLds, "_lds", "") # + "_" # getAddrName<addrKindCopy>.ret; + let AsmMatchConverter = !if(isLds, "cvtMubufLds", "cvtMubuf"); + let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", ""); let mayLoad = 1; let mayStore = 0; let maybeAtomic = 1; + let Uses = !if(isLds, [EXEC, M0], [EXEC]); + let has_tfe = !if(isLds, 0, 1); + let lds = isLds; } // FIXME: tfe can't be an operand because it requires a separate @@ -447,32 +465,45 @@ class MUBUF_Load_Pseudo <string opName, multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, ValueType load_vt = i32, SDPatternOperator ld = null_frag, - bit TiedDest = 0> { + bit TiedDest = 0, + bit isLds = 0> { def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, - TiedDest, - [(set load_vt:$vdata, - (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>, - MUBUFAddr64Table<0>; + TiedDest, isLds, + !if(isLds, + [], + [(set load_vt:$vdata, + (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>, + MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>; def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, - TiedDest, - [(set load_vt:$vdata, - (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>, - MUBUFAddr64Table<1>; + TiedDest, isLds, + !if(isLds, + [], + [(set load_vt:$vdata, + (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>, + MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>; - def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>; - def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>; - def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>; + def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>; + def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>; + def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest>; - def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>; - def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>; - def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>; + def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>; + def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, 
vdataClass, TiedDest, isLds>; + def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>; + def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>; } } +multiclass MUBUF_Pseudo_Loads_Lds<string opName, RegisterClass vdataClass, + ValueType load_vt = i32, + SDPatternOperator ld_nolds = null_frag, + SDPatternOperator ld_lds = null_frag> { + defm NAME : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_nolds>; + defm _LDS : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_lds, 0, 1>; +} + class MUBUF_Store_Pseudo <string opName, int addrKind, RegisterClass vdataClass, @@ -499,12 +530,12 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>, - MUBUFAddr64Table<0>; + MUBUFAddr64Table<0, NAME>; def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>, - MUBUFAddr64Table<1>; + MUBUFAddr64Table<1, NAME>; def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; @@ -518,6 +549,23 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, } } +class MUBUF_Pseudo_Store_Lds<string opName> + : MUBUF_Pseudo<opName, + (outs), + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc), + " $srsrc, $soffset$offset lds$glc$slc"> { + let mayLoad = 0; + let mayStore = 1; + let maybeAtomic = 1; + + let has_vdata = 0; + let has_vaddr = 0; + let has_tfe = 0; + let lds = 1; + + let Uses = [EXEC, M0]; + let AsmMatchConverter = "cvtMubufLds"; +} class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, list<RegisterClass> vaddrList=[]> { @@ -525,15 +573,15 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, dag ret = !if(vdata_in, !if(!empty(vaddrList), (ins vdataClass:$vdata_in, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc), + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc), (ins vdataClass:$vdata_in, vaddrClass:$vaddr, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc) + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc) ), !if(!empty(vaddrList), (ins vdataClass:$vdata, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc), + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc), (ins vdataClass:$vdata, vaddrClass:$vaddr, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc) + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc) )); } @@ -618,9 +666,9 @@ multiclass MUBUF_Pseudo_Atomics <string opName, SDPatternOperator atomic> { def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>, - MUBUFAddr64Table <0>; + MUBUFAddr64Table <0, NAME>; def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>, - MUBUFAddr64Table <1>; + MUBUFAddr64Table <1, NAME>; def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; @@ -629,13 +677,13 @@ multiclass MUBUF_Pseudo_Atomics <string opName, [(set vdataType:$vdata, (atomic (MUBUFOffsetAtomic 
v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, "_RTN">; + MUBUFAddr64Table <0, NAME # "_RTN">; def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(set vdataType:$vdata, (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, "_RTN">; + MUBUFAddr64Table <1, NAME # "_RTN">; def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; @@ -647,7 +695,7 @@ multiclass MUBUF_Pseudo_Atomics <string opName, // MUBUF Instructions //===----------------------------------------------------------------------===// -defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads < +defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads_Lds < "buffer_load_format_x", VGPR_32 >; defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads < @@ -671,19 +719,74 @@ defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores < defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores < "buffer_store_format_xyzw", VReg_128 >; -defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads < + +let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { + defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_x", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xy", VReg_64 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyz", VReg_96 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyzw", VReg_128 + >; + defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_x", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xy", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyz", VReg_96 + >; + defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyzw", VReg_128 + >; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { + defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_x", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xy", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyz", VReg_64 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyzw", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_x", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xy", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyz", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyzw", VReg_64 + >; +} // End HasPackedD16VMem. 
+ +defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds < "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 >; -defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads < +defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds < "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8 >; -defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads < +defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds < "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16 >; -defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads < +defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds < "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16 >; -defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads < +defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds < "buffer_load_dword", VGPR_32, i32, mubuf_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads < @@ -695,6 +798,22 @@ defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads < defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load >; + +// This is not described in AMD documentation, +// but 'lds' versions of these opcodes are available +// in at least GFX8+ chips. See Bug 37653. +let SubtargetPredicate = isVI in { +defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads < + "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1 +>; +defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads < + "buffer_load_dwordx3", VReg_96, untyped, null_frag, 0, 1 +>; +defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads < + "buffer_load_dwordx4", VReg_128, v4i32, null_frag, 0, 1 +>; +} + defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < "buffer_store_byte", VGPR_32, i32, truncstorei8_global >; @@ -792,6 +911,10 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global >; +let SubtargetPredicate = isVI in { +def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">; +} + let SubtargetPredicate = isSI in { // isn't on CI & VI /* defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">; @@ -842,6 +965,13 @@ defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores < "buffer_store_short_d16_hi", VGPR_32, i32 >; +defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_hi_x", VGPR_32 +>; +defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_hi_x", VGPR_32 +>; + } // End HasD16LoadStore def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", @@ -860,6 +990,28 @@ defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; +let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", 
VReg_128>; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; +} // End HasPackedD16VMem. + let SubtargetPredicate = isCIVI in { //===----------------------------------------------------------------------===// @@ -922,6 +1074,19 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">; + +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">; +} // End HasPackedD16VMem. + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; @@ -969,6 +1134,19 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; + +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">; +} // End HasPackedD16VMem. 
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">; @@ -1210,7 +1388,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>; -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>; defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>; defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>; @@ -1325,7 +1503,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OF defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>; -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { // Hiding the extract high pattern in the PatFrag seems to not // automatically increase the complexity. let AddedComplexity = 1 in { @@ -1382,6 +1560,18 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">; +} // End HasPackedD16VMem. + multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< @@ -1431,6 +1621,18 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY" defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">; +} // End HasPackedD16VMem. 
+ //===----------------------------------------------------------------------===// // Target instructions, move to the appropriate target TD file //===----------------------------------------------------------------------===// @@ -1451,7 +1653,7 @@ class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> : let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{15} = ps.addr64; - let Inst{16} = lds; + let Inst{16} = !if(ps.lds, 1, 0); let Inst{24-18} = op; let Inst{31-26} = 0x38; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); @@ -1470,6 +1672,31 @@ multiclass MUBUF_Real_AllAddr_si<bits<7> op> { def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; } +multiclass MUBUF_Real_AllAddr_Lds_si<bits<7> op> { + + def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, + MUBUFLdsTable<0, NAME # "_OFFSET_si">; + def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>, + MUBUFLdsTable<0, NAME # "_ADDR64_si">; + def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, + MUBUFLdsTable<0, NAME # "_OFFEN_si">; + def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, + MUBUFLdsTable<0, NAME # "_IDXEN_si">; + def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, + MUBUFLdsTable<0, NAME # "_BOTHEN_si">; + + def _LDS_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, + MUBUFLdsTable<1, NAME # "_OFFSET_si">; + def _LDS_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>, + MUBUFLdsTable<1, NAME # "_ADDR64_si">; + def _LDS_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, + MUBUFLdsTable<1, NAME # "_OFFEN_si">; + def _LDS_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, + MUBUFLdsTable<1, NAME # "_IDXEN_si">; + def _LDS_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, + MUBUFLdsTable<1, NAME # "_BOTHEN_si">; +} + multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> { def _OFFSET_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; def _ADDR64_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>; @@ -1478,7 +1705,7 @@ multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> { def _BOTHEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; } -defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_si <0x00>; +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_si <0x00>; defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>; defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>; defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>; @@ -1486,11 +1713,11 @@ defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>; defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>; defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>; defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>; -defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_si <0x08>; -defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_si <0x09>; -defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_si <0x0a>; -defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_si <0x0b>; -defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_si <0x0c>; +defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_si <0x08>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_si <0x09>; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_si <0x0a>; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_si <0x0b>; +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_si <0x0c>; defm 
BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>; defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>; defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>; @@ -1575,7 +1802,7 @@ multiclass MTBUF_Real_AllAddr_si<bits<3> op> { defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; -//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; @@ -1610,7 +1837,7 @@ class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> : let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); - let Inst{16} = lds; + let Inst{16} = !if(ps.lds, 1, 0); let Inst{17} = !if(ps.has_slc, slc, ?); let Inst{24-18} = op; let Inst{31-26} = 0x38; //encoding @@ -1628,6 +1855,56 @@ multiclass MUBUF_Real_AllAddr_vi<bits<7> op> { def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; } +multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> { + + def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, + MUBUFLdsTable<0, NAME # "_OFFSET_vi">; + def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, + MUBUFLdsTable<0, NAME # "_OFFEN_vi">; + def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, + MUBUFLdsTable<0, NAME # "_IDXEN_vi">; + def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, + MUBUFLdsTable<0, NAME # "_BOTHEN_vi">; + + def _LDS_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, + MUBUFLdsTable<1, NAME # "_OFFSET_vi">; + def _LDS_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, + MUBUFLdsTable<1, NAME # "_OFFEN_vi">; + def _LDS_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, + MUBUFLdsTable<1, NAME # "_IDXEN_vi">; + def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, + MUBUFLdsTable<1, NAME # "_BOTHEN_vi">; +} + +class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> : + MUBUF_Real<op, ps>, + Enc64, + SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> { + let AssemblerPredicate=HasUnpackedD16VMem; + let DecoderNamespace="GFX80_UNPACKED"; + + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{16} = !if(ps.lds, 1, 0); + let Inst{17} = !if(ps.has_slc, slc, ?); + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +multiclass MUBUF_Real_AllAddr_gfx80<bits<7> op> { + def _OFFSET_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; +} + multiclass MUBUF_Real_Atomic_vi<bits<7> op> : MUBUF_Real_AllAddr_vi<op> { def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; @@ -1636,7 +1913,7 @@ multiclass MUBUF_Real_Atomic_vi<bits<7> op> : def _BOTHEN_RTN_vi : MUBUF_Real_vi 
<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; } -defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_vi <0x00>; +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_vi <0x00>; defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x01>; defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x02>; defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x03>; @@ -1644,14 +1921,34 @@ defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_vi <0x04>; defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>; defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>; defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>; -defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>; -defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>; -defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>; -defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_vi <0x13>; -defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_vi <0x14>; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>; -defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x08>; + defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x09>; + defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0a>; + defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0b>; + defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0c>; + defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0d>; + defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0e>; + defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0f>; +} // End HasUnpackedD16VMem. +let SubtargetPredicate = HasPackedD16VMem in { + defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x08>; + defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x09>; + defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0a>; + defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0b>; + defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x0c>; + defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x0d>; + defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0e>; + defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0f>; +} // End HasPackedD16VMem. 
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_vi <0x10>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_vi <0x11>; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>; +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_Lds_vi <0x15>; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>; defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>; defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>; defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>; @@ -1668,6 +1965,9 @@ defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x23>; defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_vi <0x24>; defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_vi <0x25>; +defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_vi <0x26>; +defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_vi <0x27>; + defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_vi <0x40>; defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_vi <0x41>; defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_vi <0x42>; @@ -1696,6 +1996,8 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_vi <0x6a>; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>; +def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>; + def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; @@ -1729,11 +2031,61 @@ multiclass MTBUF_Real_AllAddr_vi<bits<4> op> { def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; } -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; -//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; +class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> : + MTBUF_Real<ps>, + Enc64, + SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> { + let AssemblerPredicate=HasUnpackedD16VMem; + let DecoderNamespace="GFX80_UNPACKED"; + + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{18-15} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +multiclass MTBUF_Real_AllAddr_gfx80<bits<4> op> { + def _OFFSET_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +} + +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0x00>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x01>; 
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x02>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x03>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <0x04>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x05>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x06>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x07>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x08>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x09>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0a>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0b>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0c>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0d>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0e>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0f>; +} // End HasUnpackedD16VMem. +let SubtargetPredicate = HasPackedD16VMem in { + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x08>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x09>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0a>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0b>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x0c>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x0d>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>; +} // End HasUnpackedD16VMem. diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 3a8503030414..174b2df15300 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -1,18 +1,33 @@ set(LLVM_TARGET_DEFINITIONS AMDGPU.td) -tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) -tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) -tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) -tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) -tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) -tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) -tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) -tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) +tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) +tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM AMDGPUGenIntrinsicEnums.inc -gen-tgt-intrinsic-enums) +tablegen(LLVM AMDGPUGenIntrinsicImpl.inc -gen-tgt-intrinsic-impl) +tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering) tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank) +tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables) +tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) + +set(LLVM_TARGET_DEFINITIONS AMDGPUGISel.td) +tablegen(LLVM AMDGPUGenGlobalISel.inc -gen-global-isel) + +set(LLVM_TARGET_DEFINITIONS R600.td) +tablegen(LLVM R600GenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM R600GenCallingConv.inc -gen-callingconv) 
+tablegen(LLVM R600GenDAGISel.inc -gen-dag-isel) +tablegen(LLVM R600GenDFAPacketizer.inc -gen-dfa-packetizer) +tablegen(LLVM R600GenInstrInfo.inc -gen-instr-info) +tablegen(LLVM R600GenMCCodeEmitter.inc -gen-emitter) +tablegen(LLVM R600GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget) + add_public_tablegen_target(AMDGPUCommonTableGen) add_llvm_target(AMDGPUCodeGen @@ -25,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp AMDGPUFrameLowering.cpp + AMDGPUHSAMetadataStreamer.cpp AMDGPUInstrInfo.cpp AMDGPUInstructionSelector.cpp AMDGPUIntrinsicInfo.cpp @@ -34,13 +50,14 @@ add_llvm_target(AMDGPUCodeGen AMDGPULibCalls.cpp AMDGPULibFunc.cpp AMDGPULowerIntrinsics.cpp + AMDGPULowerKernelArguments.cpp + AMDGPULowerKernelAttributes.cpp AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp - AMDGPUOpenCLImageTypeLoweringPass.cpp AMDGPUPromoteAlloca.cpp AMDGPURegAsmNames.inc.cpp AMDGPURegisterBankInfo.cpp @@ -53,12 +70,14 @@ add_llvm_target(AMDGPUCodeGen AMDGPUUnifyDivergentExitNodes.cpp AMDGPUUnifyMetadata.cpp AMDGPUInline.cpp + AMDGPUPerfHintAnalysis.cpp AMDILCFGStructurizer.cpp GCNHazardRecognizer.cpp GCNIterativeScheduler.cpp GCNMinRegStrategy.cpp GCNRegPressure.cpp GCNSchedStrategy.cpp + R600AsmPrinter.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp R600EmitClauseMarkers.cpp @@ -68,6 +87,7 @@ add_llvm_target(AMDGPUCodeGen R600ISelLowering.cpp R600MachineFunctionInfo.cpp R600MachineScheduler.cpp + R600OpenCLImageTypeLoweringPass.cpp R600OptimizeVectorRegisters.cpp R600Packetizer.cpp R600RegisterInfo.cpp @@ -77,10 +97,10 @@ add_llvm_target(AMDGPUCodeGen SIFixVGPRCopies.cpp SIFixWWMLiveness.cpp SIFoldOperands.cpp + SIFormMemoryClauses.cpp SIFrameLowering.cpp SIInsertSkips.cpp SIInsertWaitcnts.cpp - SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp SILoadStoreOptimizer.cpp @@ -99,8 +119,8 @@ add_llvm_target(AMDGPUCodeGen ) add_subdirectory(AsmParser) -add_subdirectory(InstPrinter) add_subdirectory(Disassembler) -add_subdirectory(TargetInfo) +add_subdirectory(InstPrinter) add_subdirectory(MCTargetDesc) +add_subdirectory(TargetInfo) add_subdirectory(Utils) diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index f898fd7948cc..cdc6ab9412e6 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -440,7 +440,7 @@ defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; -defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc <"ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32, "ds_min_f32">; defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">; defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">; @@ -584,6 +584,8 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", int_amdgcn_ds_bpermute>; } +def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; + } // let SubtargetPredicate = isVI //===----------------------------------------------------------------------===// @@ -600,8 +602,6 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < (inst $ptr, (as_i16imm $offset), 
(i1 0)) >; -// FIXME: Passing name of PatFrag in workaround. Why doesn't -// !cast<PatFrag>(frag.NAME#"_m0") work!? multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { @@ -609,7 +609,7 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { } let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSReadPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; } } @@ -647,14 +647,17 @@ defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">; defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">; defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">; defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">; +defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">; +defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">; let AddedComplexity = 100 in { defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">; +defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">; } // End AddedComplexity = 100 -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { let AddedComplexity = 100 in { defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>; defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>; @@ -678,7 +681,24 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { } let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSWritePat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; + } +} + +// Irritatingly, atomic_store reverses the order of operands from a +// normal store. 
+class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSAtomicWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; } } @@ -687,8 +707,10 @@ defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">; defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">; defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">; defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">; +defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">; +defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local">; -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>; def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>; } @@ -720,6 +742,8 @@ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>; let AddedComplexity = 100 in { defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">; +defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">; + } // End AddedComplexity = 100 class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), @@ -732,7 +756,8 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { } let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSAtomicRetPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, + !cast<PatFrag>(frag)>; } } @@ -749,7 +774,8 @@ multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> { } let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSAtomicCmpXChg<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, + !cast<PatFrag>(frag)>; } } @@ -769,6 +795,9 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">; defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">; defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">; defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">; +defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">; // 64-bit atomics. 
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">; @@ -1123,6 +1152,7 @@ def DS_XOR_SRC2_B32_vi : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>; def DS_WRITE_SRC2_B32_vi : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>; def DS_MIN_SRC2_F32_vi : DS_Real_vi<0x92, DS_MIN_SRC2_F32>; def DS_MAX_SRC2_F32_vi : DS_Real_vi<0x93, DS_MAX_SRC2_F32>; +def DS_ADD_SRC2_F32_vi : DS_Real_vi<0x95, DS_ADD_SRC2_F32>; def DS_ADD_SRC2_U64_vi : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>; def DS_SUB_SRC2_U64_vi : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>; def DS_RSUB_SRC2_U64_vi : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>; diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 47a2d3f2fdc5..f3de903f21b2 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -20,7 +20,9 @@ #include "Disassembler/AMDGPUDisassembler.h" #include "AMDGPU.h" #include "AMDGPURegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm-c/Disassembler.h" #include "llvm/ADT/APInt.h" @@ -198,6 +200,21 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); if (Res) { IsSDWA = true; break; } + + if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) { + Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address); + if (Res) + break; + } + + // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and + // v_mad_mixhi_f16 for FMA variants. Try to decode using this special + // table first so we print the correct name. + if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) { + Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address); + if (Res) + break; + } } // Reinitialize Bytes as DPP64 could have eaten too much @@ -228,7 +245,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || - MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) { + MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi || + MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi)) { // Insert dummy unused src2_modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src2_modifiers); @@ -241,7 +259,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res && IsSDWA) Res = convertSDWAInst(MI); - Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0; + // if the opcode was not recognized we'll assume a Size of 4 bytes + // (unless there are fewer bytes left) + Size = Res ? (MaxInstBytesNum - Bytes.size()) + : std::min((size_t)4, Bytes_.size()); return Res; } @@ -264,26 +285,70 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { return MCDisassembler::Success; } +// Note that MIMG format provides no information about VADDR size. +// Consequently, decoded instructions always show address +// as if it has 1 dword, which could be not really so. 
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { + + int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vdst); + int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dmask); + + int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::tfe); + int D16Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::d16); + + assert(VDataIdx != -1); + assert(DMaskIdx != -1); + assert(TFEIdx != -1); + + bool IsAtomic = (VDstIdx != -1); + bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4; + unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf; if (DMask == 0) return MCDisassembler::Success; - unsigned ChannelCount = countPopulation(DMask); - if (ChannelCount == 1) + unsigned DstSize = IsGather4 ? 4 : countPopulation(DMask); + if (DstSize == 1) return MCDisassembler::Success; - int NewOpcode = AMDGPU::getMaskedMIMGOp(*MCII, MI.getOpcode(), ChannelCount); - assert(NewOpcode != -1 && "could not find matching mimg channel instruction"); + bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm(); + if (D16 && AMDGPU::hasPackedD16(STI)) { + DstSize = (DstSize + 1) / 2; + } + + // FIXME: Add tfe support + if (MI.getOperand(TFEIdx).getImm()) + return MCDisassembler::Success; + + int NewOpcode = -1; + + if (IsGather4) { + if (D16 && AMDGPU::hasPackedD16(STI)) + NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), 2); + else + return MCDisassembler::Success; + } else { + NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), DstSize); + if (NewOpcode == -1) + return MCDisassembler::Success; + } + auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass; - // Widen the register to the correct number of enabled channels. + // Get first subregister of VData unsigned Vdata0 = MI.getOperand(VDataIdx).getReg(); + unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0); + Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0; + + // Widen the register to the correct number of enabled channels. auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, &MRI.getRegClass(RCID)); if (NewVdata == AMDGPU::NoRegister) { @@ -297,6 +362,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { // how it is usually emitted because the number of register components is not // in the instruction encoding. 
MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata); + + if (IsAtomic) { + // Atomic operations have an additional operand (a copy of data) + MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata); + } + return MCDisassembler::Success; } @@ -690,9 +761,8 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { switch (Val) { case 102: return createRegOperand(FLAT_SCR_LO); case 103: return createRegOperand(FLAT_SCR_HI); - // ToDo: no support for xnack_mask_lo/_hi register - case 104: - case 105: break; + case 104: return createRegOperand(XNACK_MASK_LO); + case 105: return createRegOperand(XNACK_MASK_HI); case 106: return createRegOperand(VCC_LO); case 107: return createRegOperand(VCC_HI); case 108: assert(!isGFX9()); return createRegOperand(TBA_LO); @@ -722,6 +792,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { switch (Val) { case 102: return createRegOperand(FLAT_SCR); + case 104: return createRegOperand(XNACK_MASK); case 106: return createRegOperand(VCC); case 108: assert(!isGFX9()); return createRegOperand(TBA); case 110: assert(!isGFX9()); return createRegOperand(TMA); @@ -732,8 +803,9 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { } MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, - unsigned Val) const { + const unsigned Val) const { using namespace AMDGPU::SDWA; + using namespace AMDGPU::EncValues; if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { // XXX: static_cast<int> is needed to avoid stupid warning: @@ -754,7 +826,15 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, Val - SDWA9EncValues::SRC_TTMP_MIN); } - return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + const unsigned SVal = Val - SDWA9EncValues::SRC_SGPR_MIN; + + if (INLINE_INTEGER_C_MIN <= SVal && SVal <= INLINE_INTEGER_C_MAX) + return decodeIntImmed(SVal); + + if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX) + return decodeFPImmed(Width, SVal); + + return decodeSpecialReg32(SVal); } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { return createRegOperand(getVgprClassId(Width), Val); } @@ -815,6 +895,9 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst, } auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo); + if (!Symbols) + return false; + auto Result = std::find_if(Symbols->begin(), Symbols->end(), [Value](const SymbolInfoTy& Val) { return std::get<0>(Val) == static_cast<uint64_t>(Value) diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 5e26f97b0c86..944f4ffe598d 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -15,7 +15,6 @@ def isEG : Predicate< "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " - "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && " "!Subtarget->hasCaymanISA()" >; @@ -693,7 +692,7 @@ def : EGOrCaymanPat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; def : EGOrCaymanPat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; // SHA-256 Patterns -def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; +defm : SHA256MaPattern <BFI_INT_eg, XOR_INT, R600_Reg64>; def EG_ExportSwz : ExportSwzInst { let Word1{19-16} = 0; // BURST_COUNT diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 693869128081..3ef473b7fd96 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -135,7 +135,7 @@ class 
FLAT_Load_Pseudo <string opName, RegisterClass regClass, !con((ins VReg_64:$vaddr), !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), - (ins GLC:$glc, slc:$slc)), + (ins GLC:$glc, SLC:$slc)), !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { let has_data = 0; @@ -158,7 +158,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, !con((ins VReg_64:$vaddr, vdataClass:$vdata), !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), - (ins GLC:$glc, slc:$slc)), + (ins GLC:$glc, SLC:$slc)), " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { let mayLoad = 0; let mayStore = 1; @@ -188,8 +188,8 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, opName, (outs regClass:$vdst), !if(EnableSaddr, - (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc), - (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)), + (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc), + (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)), " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc"> { let has_data = 0; let mayLoad = 1; @@ -204,8 +204,8 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En opName, (outs), !if(EnableSaddr, - (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc), - (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)), + (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc), + (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)), " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc"> { let mayLoad = 0; let mayStore = 1; @@ -260,7 +260,7 @@ multiclass FLAT_Atomic_Pseudo< RegisterClass data_rc = vdst_rc> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc), " $vaddr, $vdata$offset$slc">, AtomicNoRet <opName, 0> { let PseudoInstr = NAME; @@ -268,7 +268,7 @@ multiclass FLAT_Atomic_Pseudo< def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc), " $vdst, $vaddr, $vdata$offset glc$slc", [(set vt:$vdst, (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, @@ -285,7 +285,7 @@ multiclass FLAT_Global_Atomic_Pseudo< def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), " $vaddr, $vdata, off$offset$slc">, AtomicNoRet <opName, 0> { let has_saddr = 1; @@ -294,7 +294,7 @@ multiclass FLAT_Global_Atomic_Pseudo< def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), " $vdst, $vaddr, $vdata, off$offset glc$slc", [(set vt:$vdst, (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, @@ -304,7 +304,7 @@ multiclass 
FLAT_Global_Atomic_Pseudo< def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), " $vaddr, $vdata, $saddr$offset$slc">, AtomicNoRet <opName#"_saddr", 0> { let has_saddr = 1; @@ -314,7 +314,7 @@ multiclass FLAT_Global_Atomic_Pseudo< def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">, AtomicNoRet <opName#"_saddr", 1> { let has_saddr = 1; @@ -780,7 +780,7 @@ def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; @@ -824,7 +824,7 @@ def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>; def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>; def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>; -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>; def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>; diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index dd515b0bf2f1..f236f10ba75a 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -16,6 +16,7 @@ #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineFunction.h" @@ -39,7 +40,7 @@ using namespace llvm; GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : CurrCycleInstr(nullptr), MF(MF), - ST(MF.getSubtarget<SISubtarget>()), + ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()), ClauseUses(TRI.getNumRegUnits()), @@ -355,13 +356,13 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { } int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); int WaitStatesNeeded = 0; WaitStatesNeeded = checkSoftClauseHazards(SMRD); // This SMRD hazard only affects SI. 
- if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS) + if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS) return WaitStatesNeeded; // A read of an SGPR by SMRD instruction requires 4 wait states when the @@ -398,7 +399,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { } int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { - if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return 0; int WaitStatesNeeded = checkSoftClauseHazards(VMEM); @@ -634,7 +635,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { } int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) { - if (MI->isDebugValue()) + if (MI->isDebugInstr()) return 0; const SIRegisterInfo *TRI = ST.getRegisterInfo(); diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h index f9a6e395a454..ca17e7cb6018 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -28,7 +28,7 @@ class MachineRegisterInfo; class ScheduleDAG; class SIInstrInfo; class SIRegisterInfo; -class SISubtarget; +class GCNSubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { // This variable stores the instruction that has been emitted this cycle. It @@ -37,7 +37,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { MachineInstr *CurrCycleInstr; std::list<MachineInstr*> EmittedInstrs; const MachineFunction &MF; - const SISubtarget &ST; + const GCNSubtarget &ST; const SIInstrInfo &TII; const SIRegisterInfo &TRI; diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp index ba8211b189cf..651091d44136 100644 --- a/lib/Target/AMDGPU/GCNILPSched.cpp +++ b/lib/Target/AMDGPU/GCNILPSched.cpp @@ -149,9 +149,9 @@ static int BUCompareLatency(const SUnit *left, const SUnit *right) { int LDepth = left->getDepth(); int RDepth = right->getDepth(); if (LDepth != RDepth) { - DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum - << ") depth " << LDepth << " vs SU (" << right->NodeNum - << ") depth " << RDepth << "\n"); + LLVM_DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum + << ") depth " << LDepth << " vs SU (" << right->NodeNum + << ") depth " << RDepth << "\n"); return LDepth < RDepth ? 1 : -1; } if (left->Latency != right->Latency) @@ -169,9 +169,9 @@ const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right) if (!DisableSchedCriticalPath) { int spread = (int)left->getDepth() - (int)right->getDepth(); if (std::abs(spread) > MaxReorderWindow) { - DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): " - << left->getDepth() << " != SU(" << right->NodeNum << "): " - << right->getDepth() << "\n"); + LLVM_DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): " + << left->getDepth() << " != SU(" << right->NodeNum + << "): " << right->getDepth() << "\n"); return left->getDepth() < right->getDepth() ? 
right : left; } } @@ -324,19 +324,18 @@ GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots, if (AvailQueue.empty()) break; - DEBUG( - dbgs() << "\n=== Picking candidate\n" - "Ready queue:"; - for (auto &C : AvailQueue) - dbgs() << ' ' << C.SU->NodeNum; - dbgs() << '\n'; - ); + LLVM_DEBUG(dbgs() << "\n=== Picking candidate\n" + "Ready queue:"; + for (auto &C + : AvailQueue) dbgs() + << ' ' << C.SU->NodeNum; + dbgs() << '\n';); auto C = pickCandidate(); assert(C); AvailQueue.remove(*C); auto SU = C->SU; - DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); + LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); advanceToCycle(SU->getHeight()); diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index a0e4f7ff24cb..15366d66bd85 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -11,6 +11,7 @@ #include "AMDGPUSubtarget.h" #include "GCNRegPressure.h" #include "GCNSchedStrategy.h" +#include "SIMachineFunctionInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -19,6 +20,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -68,14 +70,14 @@ static void printRegion(raw_ostream &OS, auto I = Begin; MaxInstNum = std::max(MaxInstNum, 1u); for (; I != End && MaxInstNum; ++I, --MaxInstNum) { - if (!I->isDebugValue() && LIS) + if (!I->isDebugInstr() && LIS) OS << LIS->getInstructionIndex(*I); OS << '\t' << *I; } if (I != End) { OS << "\t...\n"; I = std::prev(End); - if (!I->isDebugValue() && LIS) + if (!I->isDebugInstr() && LIS) OS << LIS->getInstructionIndex(*I); OS << '\t' << *I; } @@ -106,7 +108,7 @@ static void printLivenessInfo(raw_ostream &OS, LLVM_DUMP_METHOD void GCNIterativeScheduler::printRegions(raw_ostream &OS) const { - const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); for (const auto R : Regions) { OS << "Region to schedule "; printRegion(OS, R->Begin, R->End, LIS, 1); @@ -130,7 +132,7 @@ LLVM_DUMP_METHOD void GCNIterativeScheduler::printSchedRP(raw_ostream &OS, const GCNRegPressure &Before, const GCNRegPressure &After) const { - const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); OS << "RP before: "; Before.print(OS, &ST); OS << "RP after: "; @@ -199,8 +201,8 @@ public: void schedule() { assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End); - DEBUG(dbgs() << "\nScheduling "; - printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2)); + LLVM_DEBUG(dbgs() << "\nScheduling "; + printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2)); Sch.BaseClass::schedule(); // Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore @@ -310,14 +312,13 @@ void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden void GCNIterativeScheduler::schedule() { // overriden // do nothing - DEBUG( - printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS); - if (!Regions.empty() && Regions.back()->Begin == RegionBegin) { - dbgs() << "Max RP: "; - Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>()); - } - dbgs() << '\n'; - ); + LLVM_DEBUG(printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS); + if (!Regions.empty() && Regions.back()->Begin == RegionBegin) { + dbgs() << "Max RP: "; + 
Regions.back()->MaxPressure.print( + dbgs(), &MF.getSubtarget<GCNSubtarget>()); + } dbgs() + << '\n';); } void GCNIterativeScheduler::finalizeSchedule() { // overriden @@ -383,10 +384,10 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, if (MI != &*Top) { BB->remove(MI); BB->insert(Top, MI); - if (!MI->isDebugValue()) + if (!MI->isDebugInstr()) LIS->handleMove(*MI, true); } - if (!MI->isDebugValue()) { + if (!MI->isDebugInstr()) { // Reset read - undef flags and update them later. for (auto &Op : MI->operands()) if (Op.isReg() && Op.isDef()) @@ -417,7 +418,7 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, #ifndef NDEBUG const auto RegionMaxRP = getRegionPressure(R); - const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); #endif assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP)) || (dbgs() << "Max RP mismatch!!!\n" @@ -432,8 +433,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, // Sort recorded regions by pressure - highest at the front void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { - const auto &ST = MF.getSubtarget<SISubtarget>(); - std::sort(Regions.begin(), Regions.end(), + const auto &ST = MF.getSubtarget<GCNSubtarget>(); + llvm::sort(Regions.begin(), Regions.end(), [&ST, TargetOcc](const Region *R1, const Region *R2) { return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc); }); @@ -450,24 +451,24 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { // BestSchedules aren't deleted on fail. unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { // TODO: assert Regions are sorted descending by pressure - const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); - DEBUG(dbgs() << "Trying to to improve occupancy, target = " << TargetOcc - << ", current = " << Occ << '\n'); + LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc + << ", current = " << Occ << '\n'); auto NewOcc = TargetOcc; for (auto R : Regions) { if (R->MaxPressure.getOccupancy(ST) >= NewOcc) break; - DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); - printLivenessInfo(dbgs(), R->Begin, R->End, LIS)); + LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); + printLivenessInfo(dbgs(), R->Begin, R->End, LIS)); BuildDAG DAG(*R, *this); const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); const auto MaxRP = getSchedulePressure(*R, MinSchedule); - DEBUG(dbgs() << "Occupancy improvement attempt:\n"; - printSchedRP(dbgs(), R->MaxPressure, MaxRP)); + LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n"; + printSchedRP(dbgs(), R->MaxPressure, MaxRP)); NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST)); if (NewOcc <= Occ) @@ -475,15 +476,21 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { setBestSchedule(*R, MinSchedule, MaxRP); } - DEBUG(dbgs() << "New occupancy = " << NewOcc - << ", prev occupancy = " << Occ << '\n'); + LLVM_DEBUG(dbgs() << "New occupancy = " << NewOcc + << ", prev occupancy = " << Occ << '\n'); + if (NewOcc > Occ) { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MFI->increaseOccupancy(MF, NewOcc); + } + return std::max(NewOcc, Occ); } void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( bool TryMaximizeOccupancy) { - const auto &ST = MF.getSubtarget<SISubtarget>(); - auto TgtOcc = 
ST.getOccupancyWithLocalMemSize(MF); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + auto TgtOcc = MFI->getMinAllowedOccupancy(); sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); @@ -496,9 +503,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( const int NumPasses = Occ < TgtOcc ? 2 : 1; TgtOcc = std::min(Occ, TgtOcc); - DEBUG(dbgs() << "Scheduling using default scheduler, " - "target occupancy = " << TgtOcc << '\n'); + LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " + "target occupancy = " + << TgtOcc << '\n'); GCNMaxOccupancySchedStrategy LStrgy(Context); + unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (int I = 0; I < NumPasses; ++I) { // running first pass with TargetOccupancy = 0 mimics previous scheduling @@ -509,30 +518,33 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( Ovr.schedule(); const auto RP = getRegionPressure(*R); - DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); + LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); if (RP.getOccupancy(ST) < TgtOcc) { - DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); + LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) { - DEBUG(dbgs() << ", scheduling minimal register\n"); + LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); scheduleBest(*R); } else { - DEBUG(dbgs() << ", restoring\n"); + LLVM_DEBUG(dbgs() << ", restoring\n"); Ovr.restoreOrder(); assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc); } } + FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST)); } } + MFI->limitOccupancy(FinalOccupancy); } /////////////////////////////////////////////////////////////////////////////// // Minimal Register Strategy void GCNIterativeScheduler::scheduleMinReg(bool force) { - const auto &ST = MF.getSubtarget<SISubtarget>(); - const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const auto TgtOcc = MFI->getOccupancy(); sortRegionsByPressure(TgtOcc); auto MaxPressure = Regions.front()->MaxPressure; @@ -544,7 +556,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); const auto RP = getSchedulePressure(*R, MinSchedule); - DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) { + LLVM_DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) { dbgs() << "\nWarning: Pressure becomes worse after minreg!"; printSchedRP(dbgs(), R->MaxPressure, RP); }); @@ -553,7 +565,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { break; scheduleRegion(*R, MinSchedule, RP); - DEBUG(printSchedResult(dbgs(), R, RP)); + LLVM_DEBUG(printSchedResult(dbgs(), R, RP)); MaxPressure = RP; } @@ -564,9 +576,9 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { void GCNIterativeScheduler::scheduleILP( bool TryMaximizeOccupancy) { - const auto &ST = MF.getSubtarget<SISubtarget>(); - auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF), - ST.getWavesPerEU(MF.getFunction()).second); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + auto TgtOcc = MFI->getMinAllowedOccupancy(); sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); @@ -575,26 +587,30 @@ void 
GCNIterativeScheduler::scheduleILP( Occ = tryMaximizeOccupancy(TgtOcc); TgtOcc = std::min(Occ, TgtOcc); - DEBUG(dbgs() << "Scheduling using default scheduler, " - "target occupancy = " << TgtOcc << '\n'); + LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " + "target occupancy = " + << TgtOcc << '\n'); + unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (auto R : Regions) { BuildDAG DAG(*R, *this); const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this); const auto RP = getSchedulePressure(*R, ILPSchedule); - DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); + LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); if (RP.getOccupancy(ST) < TgtOcc) { - DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); + LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) { - DEBUG(dbgs() << ", scheduling minimal register\n"); + LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); scheduleBest(*R); } } else { scheduleRegion(*R, ILPSchedule, RP); - DEBUG(printSchedResult(dbgs(), R, RP)); + LLVM_DEBUG(printSchedResult(dbgs(), R, RP)); + FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST)); } } + MFI->limitOccupancy(FinalOccupancy); } diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index 9904b5f0f4ba..192d534bb9cf 100644 --- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -142,35 +142,38 @@ GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() { unsigned Num = RQ.size(); if (Num == 1) break; - DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n'); + LLVM_DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num + << '\n'); Num = findMax(Num, [=](const Candidate &C) { return C.Priority; }); if (Num == 1) break; - DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among " - << Num << '\n'); + LLVM_DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among " + << Num << '\n'); Num = findMax(Num, [=](const Candidate &C) { auto SU = C.SU; int Res = getNotReadySuccessors(SU); - DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready " - << Res << " successors, metric = " << -Res << '\n'); + LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready " + << Res << " successors, metric = " << -Res << '\n'); return -Res; }); if (Num == 1) break; - DEBUG(dbgs() << "\nSelecting most producing candidate among " - << Num << '\n'); + LLVM_DEBUG(dbgs() << "\nSelecting most producing candidate among " << Num + << '\n'); Num = findMax(Num, [=](const Candidate &C) { auto SU = C.SU; auto Res = getReadySuccessors(SU); - DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready " - << Res << " successors, metric = " << Res << '\n'); + LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready " << Res + << " successors, metric = " << Res << '\n'); return Res; }); if (Num == 1) break; Num = Num ? 
Num : RQ.size(); - DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among " - << Num << '\n'); + LLVM_DEBUG( + dbgs() + << "\nCan't find best candidate, selecting in program order among " + << Num << '\n'); Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; }); assert(Num == 1); } while (false); @@ -202,17 +205,17 @@ void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) { Worklist.push_back(P.getSUnit()); } } - DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum - << ")'s non-ready successors of " << Priority - << " priority in ready queue: "); + LLVM_DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum + << ")'s non-ready successors of " << Priority + << " priority in ready queue: "); const auto SetEnd = Set.end(); for (auto &C : RQ) { if (Set.find(C.SU) != SetEnd) { C.Priority = Priority; - DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')'); + LLVM_DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')'); } } - DEBUG(dbgs() << '\n'); + LLVM_DEBUG(dbgs() << '\n'); } void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) { @@ -243,19 +246,19 @@ GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots, releaseSuccessors(&DAG.EntrySU, StepNo); while (!RQ.empty()) { - DEBUG( - dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n" - "Ready queue:"; - for (auto &C : RQ) - dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')'; - dbgs() << '\n'; - ); + LLVM_DEBUG(dbgs() << "\n=== Picking candidate, Step = " << StepNo + << "\n" + "Ready queue:"; + for (auto &C + : RQ) dbgs() + << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')'; + dbgs() << '\n';); auto C = pickCandidate(); assert(C); RQ.remove(*C); auto SU = C->SU; - DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); + LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); releaseSuccessors(SU, StepNo); Schedule.push_back(SU); diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td index b2a3f652abd8..d76acfa24f90 100644 --- a/lib/Target/AMDGPU/GCNProcessors.td +++ b/lib/Target/AMDGPU/GCNProcessors.td @@ -93,14 +93,6 @@ def : ProcessorModel<"bonaire", SIQuarterSpeedModel, // GCN GFX8 (Volcanic Islands (VI)). 
//===----------------------------------------------------------------------===// -def : ProcessorModel<"gfx800", SIQuarterSpeedModel, - [FeatureISAVersion8_0_0] ->; - -def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureISAVersion8_0_0] ->; - def : ProcessorModel<"gfx801", SIQuarterSpeedModel, [FeatureISAVersion8_0_1] >; @@ -113,6 +105,10 @@ def : ProcessorModel<"gfx802", SIQuarterSpeedModel, [FeatureISAVersion8_0_2] >; +def : ProcessorModel<"iceland", SIQuarterSpeedModel, + [FeatureISAVersion8_0_2] +>; + def : ProcessorModel<"tonga", SIQuarterSpeedModel, [FeatureISAVersion8_0_2] >; @@ -152,3 +148,11 @@ def : ProcessorModel<"gfx900", SIQuarterSpeedModel, def : ProcessorModel<"gfx902", SIQuarterSpeedModel, [FeatureISAVersion9_0_2] >; + +def : ProcessorModel<"gfx904", SIQuarterSpeedModel, + [FeatureISAVersion9_0_4] +>; + +def : ProcessorModel<"gfx906", SIQuarterSpeedModel, + [FeatureISAVersion9_0_6] +>; diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 992bb7cceb6f..3d8cacc4f02c 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Config/llvm-config.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -131,7 +132,7 @@ void GCNRegPressure::inc(unsigned Reg, } } -bool GCNRegPressure::less(const SISubtarget &ST, +bool GCNRegPressure::less(const GCNSubtarget &ST, const GCNRegPressure& O, unsigned MaxOccupancy) const { const auto SGPROcc = std::min(MaxOccupancy, @@ -177,7 +178,7 @@ bool GCNRegPressure::less(const SISubtarget &ST, #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD -void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const { +void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const { OS << "VGPRs: " << getVGPRNum(); if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')'; OS << ", SGPRs: " << getSGPRNum(); @@ -283,24 +284,33 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, return LiveRegs; } -void GCNUpwardRPTracker::reset(const MachineInstr &MI, - const LiveRegSet *LiveRegsCopy) { - MRI = &MI.getParent()->getParent()->getRegInfo(); +void GCNRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy, + bool After) { + const MachineFunction &MF = *MI.getMF(); + MRI = &MF.getRegInfo(); if (LiveRegsCopy) { if (&LiveRegs != LiveRegsCopy) LiveRegs = *LiveRegsCopy; } else { - LiveRegs = getLiveRegsAfter(MI, LIS); + LiveRegs = After ? 
getLiveRegsAfter(MI, LIS) + : getLiveRegsBefore(MI, LIS); } + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); } +void GCNUpwardRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy) { + GCNRPTracker::reset(MI, LiveRegsCopy, true); +} + void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(MRI && "call reset first"); LastTrackedMI = &MI; - if (MI.isDebugValue()) + if (MI.isDebugInstr()) return; auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI); @@ -348,13 +358,7 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI, NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); if (NextMI == MBBEnd) return false; - if (LiveRegsCopy) { - if (&LiveRegs != LiveRegsCopy) - LiveRegs = *LiveRegsCopy; - } else { - LiveRegs = getLiveRegsBefore(*NextMI, LIS); - } - MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); + GCNRPTracker::reset(*NextMI, LiveRegsCopy, false); return true; } diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h index e418aa0fe911..357d3b7b2334 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.h +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -49,7 +49,7 @@ struct GCNRegPressure { unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; } unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; } - unsigned getOccupancy(const SISubtarget &ST) const { + unsigned getOccupancy(const GCNSubtarget &ST) const { return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), ST.getOccupancyWithNumVGPRs(getVGPRNum())); } @@ -59,11 +59,11 @@ struct GCNRegPressure { LaneBitmask NewMask, const MachineRegisterInfo &MRI); - bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const { + bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const { return getOccupancy(ST) > O.getOccupancy(ST); } - bool less(const SISubtarget &ST, const GCNRegPressure& O, + bool less(const GCNSubtarget &ST, const GCNRegPressure& O, unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const; bool operator==(const GCNRegPressure &O) const { @@ -74,7 +74,7 @@ struct GCNRegPressure { return !(*this == O); } - void print(raw_ostream &OS, const SISubtarget *ST = nullptr) const; + void print(raw_ostream &OS, const GCNSubtarget *ST = nullptr) const; void dump() const { print(dbgs()); } private: @@ -106,6 +106,9 @@ protected: GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} + void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy, + bool After); + public: // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index d414b899050a..f09b7f6cff22 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -28,18 +28,6 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C) : GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { } -static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs, - const MachineFunction &MF) { - - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs), - ST.getOccupancyWithNumVGPRs(VGPRs)); - return std::min(MinRegOccupancy, - ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), - MF.getFunction())); -} - void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { 
GenericScheduler::initialize(DAG); @@ -47,7 +35,7 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { MF = &DAG->MF; - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); // FIXME: This is also necessary, because some passes that run after // scheduling and before regalloc increase register pressure. @@ -81,7 +69,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU Cand.AtTop = AtTop; // getDownwardPressure() and getUpwardPressure() make temporary changes to - // the the tracker, so we need to pass those function a non-const copy. + // the tracker, so we need to pass those function a non-const copy. RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); std::vector<unsigned> Pressure; @@ -200,34 +188,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); // See if BotCand is still valid (because we previously scheduled from Top). - DEBUG(dbgs() << "Picking from Bot:\n"); + LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); if (!BotCand.isValid() || BotCand.SU->isScheduled || BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); assert(BotCand.Reason != NoCand && "failed to find the first candidate"); } else { - DEBUG(traceCandidate(BotCand)); + LLVM_DEBUG(traceCandidate(BotCand)); } // Check if the top Q has a better candidate. - DEBUG(dbgs() << "Picking from Top:\n"); + LLVM_DEBUG(dbgs() << "Picking from Top:\n"); if (!TopCand.isValid() || TopCand.SU->isScheduled || TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); assert(TopCand.Reason != NoCand && "failed to find the first candidate"); } else { - DEBUG(traceCandidate(TopCand)); + LLVM_DEBUG(traceCandidate(TopCand)); } // Pick best from BotCand and TopCand. 
- DEBUG( - dbgs() << "Top Cand: "; - traceCandidate(TopCand); - dbgs() << "Bot Cand: "; - traceCandidate(BotCand); - ); + LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); + dbgs() << "Bot Cand: "; traceCandidate(BotCand);); SchedCandidate Cand; if (TopCand.Reason == BotCand.Reason) { Cand = BotCand; @@ -256,10 +240,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { } } } - DEBUG( - dbgs() << "Picking: "; - traceCandidate(Cand); - ); + LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); IsTopNode = Cand.AtTop; return Cand.SU; @@ -305,20 +286,20 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) { if (SU->isBottomReady()) Bot.removeReady(SU); - DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); + LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " + << *SU->getInstr()); return SU; } GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) : ScheduleDAGMILive(C, std::move(S)), - ST(MF.getSubtarget<SISubtarget>()), + ST(MF.getSubtarget<GCNSubtarget>()), MFI(*MF.getInfo<SIMachineFunctionInfo>()), - StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(), - MF.getFunction())), + StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) { - DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); + LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); } void GCNScheduleDAGMILive::schedule() { @@ -338,12 +319,12 @@ void GCNScheduleDAGMILive::schedule() { if (LIS) { PressureBefore = Pressure[RegionIdx]; - DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:"; - GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI); - dbgs() << "Region live-in pressure: "; - llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs()); - dbgs() << "Region register pressure: "; - PressureBefore.print(dbgs())); + LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:"; + GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI); + dbgs() << "Region live-in pressure: "; + llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs()); + dbgs() << "Region register pressure: "; + PressureBefore.print(dbgs())); } ScheduleDAGMILive::schedule(); @@ -356,45 +337,54 @@ void GCNScheduleDAGMILive::schedule() { GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; auto PressureAfter = getRealRegPressure(); - DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs())); + LLVM_DEBUG(dbgs() << "Pressure after scheduling: "; + PressureAfter.print(dbgs())); if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) { Pressure[RegionIdx] = PressureAfter; - DEBUG(dbgs() << "Pressure in desired limits, done.\n"); + LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); return; } - unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(), - PressureAfter.getVGPRNum(), MF); - unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(), - PressureBefore.getVGPRNum(), MF); - DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << - ", after " << WavesAfter << ".\n"); + unsigned Occ = MFI.getOccupancy(); + unsigned WavesAfter = std::min(Occ, PressureAfter.getOccupancy(ST)); + unsigned WavesBefore = std::min(Occ, PressureBefore.getOccupancy(ST)); + LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore + << ", after " << WavesAfter << ".\n"); // We could not 
keep current target occupancy because of the just scheduled // region. Record new occupancy for next scheduling cycle. unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); + // Allow memory bound functions to drop to 4 waves if not limited by an + // attribute. + if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy && + WavesAfter >= MFI.getMinAllowedOccupancy()) { + LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " + << MFI.getMinAllowedOccupancy() << " waves\n"); + NewOccupancy = WavesAfter; + } if (NewOccupancy < MinOccupancy) { MinOccupancy = NewOccupancy; - DEBUG(dbgs() << "Occupancy lowered for the function to " - << MinOccupancy << ".\n"); + MFI.limitOccupancy(MinOccupancy); + LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " + << MinOccupancy << ".\n"); } - if (WavesAfter >= WavesBefore) { + if (WavesAfter >= MinOccupancy) { Pressure[RegionIdx] = PressureAfter; return; } - DEBUG(dbgs() << "Attempting to revert scheduling.\n"); + LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); RegionEnd = RegionBegin; for (MachineInstr *MI : Unsched) { - if (MI->isDebugValue()) + if (MI->isDebugInstr()) continue; if (MI->getIterator() != RegionEnd) { BB->remove(MI); BB->insert(RegionEnd, MI); - if (!MI->isDebugValue()) + if (!MI->isDebugInstr()) LIS->handleMove(*MI, true); } // Reset read-undef flags and update them later. @@ -403,7 +393,7 @@ void GCNScheduleDAGMILive::schedule() { Op.setIsUndef(false); RegisterOperands RegOpers; RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); - if (!MI->isDebugValue()) { + if (!MI->isDebugInstr()) { if (ShouldTrackLaneMasks) { // Adjust liveness and add missing dead+read-undef flags. SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); @@ -415,7 +405,7 @@ void GCNScheduleDAGMILive::schedule() { } RegionEnd = MI->getIterator(); ++RegionEnd; - DEBUG(dbgs() << "Scheduling " << *MI); + LLVM_DEBUG(dbgs() << "Scheduling " << *MI); } RegionBegin = Unsched.front()->getIterator(); Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); @@ -490,7 +480,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { void GCNScheduleDAGMILive::finalizeSchedule() { GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; - DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); + LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); LiveIns.resize(Regions.size()); Pressure.resize(Regions.size()); @@ -509,9 +499,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() { if (!LIS || StartingOccupancy <= MinOccupancy) break; - DEBUG(dbgs() - << "Retrying function scheduling with lowest recorded occupancy " - << MinOccupancy << ".\n"); + LLVM_DEBUG( + dbgs() + << "Retrying function scheduling with lowest recorded occupancy " + << MinOccupancy << ".\n"); S.setTargetOccupancy(MinOccupancy); } @@ -537,12 +528,13 @@ void GCNScheduleDAGMILive::finalizeSchedule() { continue; } - DEBUG(dbgs() << "********** MI Scheduling **********\n"); - DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " " - << MBB->getName() << "\n From: " << *begin() << " To: "; - if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; - else dbgs() << "End"; - dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); + LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); + LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " " + << MBB->getName() << "\n From: " << *begin() + << " To: "; + if (RegionEnd != 
MBB->end()) dbgs() << *RegionEnd; + else dbgs() << "End"; + dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); schedule(); diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h index 060d2ca72d93..3ac6af89cb9b 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -21,7 +21,7 @@ namespace llvm { class SIMachineFunctionInfo; class SIRegisterInfo; -class SISubtarget; +class GCNSubtarget; /// This is a minimal scheduler strategy. The main difference between this /// and the GenericScheduler is that GCNSchedStrategy uses different @@ -62,9 +62,9 @@ public: class GCNScheduleDAGMILive : public ScheduleDAGMILive { - const SISubtarget &ST; + const GCNSubtarget &ST; - const SIMachineFunctionInfo &MFI; + SIMachineFunctionInfo &MFI; // Occupancy target at the beginning of function scheduling cycle. unsigned StartingOccupancy; diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index bf57f88bef91..db908368a179 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -217,6 +217,11 @@ void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "lwe"); } +void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "d16"); +} + void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -267,6 +272,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, case AMDGPU::FLAT_SCR: O << "flat_scratch"; return; + case AMDGPU::XNACK_MASK: + O << "xnack_mask"; + return; case AMDGPU::VCC_LO: O << "vcc_lo"; return; @@ -297,6 +305,12 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, case AMDGPU::FLAT_SCR_HI: O << "flat_scratch_hi"; return; + case AMDGPU::XNACK_MASK_LO: + O << "xnack_mask_lo"; + return; + case AMDGPU::XNACK_MASK_HI: + O << "xnack_mask_hi"; + return; case AMDGPU::FP_REG: case AMDGPU::SP_REG: case AMDGPU::SCRATCH_WAVE_OFFSET_REG: @@ -371,6 +385,16 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, printOperand(MI, OpNo, STI, O); } +void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI)) + O << " "; + else + O << "_e32 "; + + printOperand(MI, OpNo, STI, O); +} + void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -486,11 +510,6 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) { - static_cast<R600InstPrinter*>(this)->printOperand(MI, OpNo, O); - return; - } - if (OpNo >= MI->getNumOperands()) { O << "/*Missing OP" << OpNo << "*/"; return; @@ -612,40 +631,45 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + using namespace AMDGPU::DPP; + unsigned Imm = MI->getOperand(OpNo).getImm(); - if (Imm <= 0x0ff) { + if (Imm <= DppCtrl::QUAD_PERM_LAST) { O << " quad_perm:["; O << formatDec(Imm & 0x3) << ','; O << formatDec((Imm & 0xc) >> 2) << ','; O << formatDec((Imm & 0x30) >> 4) << ','; O << 
formatDec((Imm & 0xc0) >> 6) << ']'; - } else if ((Imm >= 0x101) && (Imm <= 0x10f)) { + } else if ((Imm >= DppCtrl::ROW_SHL_FIRST) && + (Imm <= DppCtrl::ROW_SHL_LAST)) { O << " row_shl:"; printU4ImmDecOperand(MI, OpNo, O); - } else if ((Imm >= 0x111) && (Imm <= 0x11f)) { + } else if ((Imm >= DppCtrl::ROW_SHR_FIRST) && + (Imm <= DppCtrl::ROW_SHR_LAST)) { O << " row_shr:"; printU4ImmDecOperand(MI, OpNo, O); - } else if ((Imm >= 0x121) && (Imm <= 0x12f)) { + } else if ((Imm >= DppCtrl::ROW_ROR_FIRST) && + (Imm <= DppCtrl::ROW_ROR_LAST)) { O << " row_ror:"; printU4ImmDecOperand(MI, OpNo, O); - } else if (Imm == 0x130) { + } else if (Imm == DppCtrl::WAVE_SHL1) { O << " wave_shl:1"; - } else if (Imm == 0x134) { + } else if (Imm == DppCtrl::WAVE_ROL1) { O << " wave_rol:1"; - } else if (Imm == 0x138) { + } else if (Imm == DppCtrl::WAVE_SHR1) { O << " wave_shr:1"; - } else if (Imm == 0x13c) { + } else if (Imm == DppCtrl::WAVE_ROR1) { O << " wave_ror:1"; - } else if (Imm == 0x140) { + } else if (Imm == DppCtrl::ROW_MIRROR) { O << " row_mirror"; - } else if (Imm == 0x141) { + } else if (Imm == DppCtrl::ROW_HALF_MIRROR) { O << " row_half_mirror"; - } else if (Imm == 0x142) { + } else if (Imm == DppCtrl::BCAST15) { O << " row_bcast:15"; - } else if (Imm == 0x143) { + } else if (Imm == DppCtrl::BCAST31) { O << " row_bcast:31"; } else { - llvm_unreachable("Invalid dpp_ctrl value"); + O << " /* Invalid dpp_ctrl value */"; } } @@ -936,11 +960,6 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) { - static_cast<R600InstPrinter*>(this)->printMemOperand(MI, OpNo, O); - return; - } - printOperand(MI, OpNo, STI, O); O << ", "; printOperand(MI, OpNo + 1, STI, O); @@ -966,16 +985,6 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, O << Asm; } -void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printAbs(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printClamp(MI, OpNo, O); -} - void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1002,70 +1011,6 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, O << " div:2"; } -void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printLiteral(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printLast(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printNeg(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printOMOD(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printRel(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned 
OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printUpdateExecMask(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printUpdatePred(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printWrite(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printBankSwizzle(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printRSel(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printCT(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printKCache(MI, OpNo, O); -} - void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1254,7 +1199,10 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1; O << "hwreg("; - if (ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_) { + unsigned Last = ID_SYMBOLIC_LAST_; + if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI) || AMDGPU::isVI(STI)) + Last = ID_SYMBOLIC_FIRST_GFX9_; + if (ID_SYMBOLIC_FIRST_ <= Id && Id < Last && IdSymbolic[Id]) { O << IdSymbolic[Id]; } else { O << Id; @@ -1267,6 +1215,13 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, #include "AMDGPUGenAsmWriter.inc" +void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + O.flush(); + printInstruction(MI, O); + printAnnotation(O, Annot); +} + void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O) { AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|'); @@ -1385,7 +1340,7 @@ void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (Op.isReg()) { switch (Op.getReg()) { // This is the default predicate state, so we don't need to print it. 
- case AMDGPU::PRED_SEL_OFF: + case R600::PRED_SEL_OFF: break; default: @@ -1461,3 +1416,5 @@ void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo, O << " (MASKED)"; } } + +#include "R600GenAsmWriter.inc" diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index d97f04689e18..11a496a38b2c 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -84,6 +84,8 @@ private: raw_ostream &O); void printLWE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printD16(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printExpCompr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, @@ -96,6 +98,8 @@ private: void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, @@ -214,13 +218,16 @@ protected: raw_ostream &O); }; -// FIXME: R600 specific parts of AMDGPUInstrPrinter should be moved here, and -// MCTargetDesc should be using R600InstPrinter for the R600 target. -class R600InstPrinter : public AMDGPUInstPrinter { +class R600InstPrinter : public MCInstPrinter { public: R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) - : AMDGPUInstPrinter(MAI, MII, MRI) {} + : MCInstPrinter(MAI, MII, MRI) {} + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 778d4a7ba9d0..abc88c02adca 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -26,14 +26,14 @@ namespace { class AMDGPUAsmBackend : public MCAsmBackend { public: - AMDGPUAsmBackend(const Target &T) - : MCAsmBackend() {} + AMDGPUAsmBackend(const Target &T) : MCAsmBackend(support::little) {} unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsResolved) const override; + uint64_t Value, bool IsResolved, + const MCSubtargetInfo *STI) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { @@ -43,10 +43,13 @@ public: MCInst &Res) const override { llvm_unreachable("Not implemented"); } - bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + bool mayNeedRelaxation(const MCInst &Inst, + const MCSubtargetInfo &STI) const override { + return false; + } unsigned getMinimumNopSize() const override; - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + bool writeNopData(raw_ostream &OS, uint64_t Count) const override; 
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; }; @@ -103,7 +106,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) const { + bool IsResolved, + const MCSubtargetInfo *STI) const { Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); if (!Value) return; // Doesn't change encoding. @@ -140,11 +144,11 @@ unsigned AMDGPUAsmBackend::getMinimumNopSize() const { return 4; } -bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { +bool AMDGPUAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { // If the count is not 4-byte aligned, we must be writing data into the text // section (otherwise we have unaligned instructions, and thus have far // bigger problems), so just write zeros instead. - OW->WriteZeros(Count % 4); + OS.write_zeros(Count % 4); // We are properly aligned, so write NOPs as requested. Count /= 4; @@ -154,7 +158,7 @@ bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { const uint32_t Encoded_S_NOP_0 = 0xbf800000; for (uint64_t I = 0; I != Count; ++I) - OW->write32(Encoded_S_NOP_0); + support::endian::write<uint32_t>(OS, Encoded_S_NOP_0, Endian); return true; } @@ -189,18 +193,18 @@ public: } } - std::unique_ptr<MCObjectWriter> - createObjectWriter(raw_pwrite_stream &OS) const override { - return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend, OS); + std::unique_ptr<MCObjectTargetWriter> + createObjectTargetWriter() const override { + return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend); } }; } // end anonymous namespace MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { // Use 64-bit ELF for amdgcn - return new ELFAMDGPUAsmBackend(T, TT); + return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple()); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index e443b0729606..07bef9103c0d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -66,6 +66,8 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AMDGPU_REL32_LO; case MCSymbolRefExpr::VK_AMDGPU_REL32_HI: return ELF::R_AMDGPU_REL32_HI; + case MCSymbolRefExpr::VK_AMDGPU_REL64: + return ELF::R_AMDGPU_REL64; } switch (Fixup.getKind()) { @@ -82,11 +84,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, llvm_unreachable("unhandled relocation type"); } -std::unique_ptr<MCObjectWriter> +std::unique_ptr<MCObjectTargetWriter> llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend, - raw_pwrite_stream &OS) { - auto MOTW = llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI, - HasRelocationAddend); - return createELFObjectWriter(std::move(MOTW), OS, true); + bool HasRelocationAddend) { + return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI, + HasRelocationAddend); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp index 1497edc7a054..c627a08e7463 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ 
-12,37 +12,28 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCObjectWriter.h" using namespace llvm; -AMDGPUELFStreamer::AMDGPUELFStreamer(const Triple &T, MCContext &Context, - std::unique_ptr<MCAsmBackend> MAB, - raw_pwrite_stream &OS, - std::unique_ptr<MCCodeEmitter> Emitter) - : MCELFStreamer(Context, std::move(MAB), OS, std::move(Emitter)) { - unsigned Arch = ELF::EF_AMDGPU_ARCH_NONE; - switch (T.getArch()) { - case Triple::r600: - Arch = ELF::EF_AMDGPU_ARCH_R600; - break; - case Triple::amdgcn: - Arch = ELF::EF_AMDGPU_ARCH_GCN; - break; - default: - break; - } +namespace { + +class AMDGPUELFStreamer : public MCELFStreamer { +public: + AMDGPUELFStreamer(const Triple &T, MCContext &Context, + std::unique_ptr<MCAsmBackend> MAB, + std::unique_ptr<MCObjectWriter> OW, + std::unique_ptr<MCCodeEmitter> Emitter) + : MCELFStreamer(Context, std::move(MAB), std::move(OW), + std::move(Emitter)) {} +}; - MCAssembler &MCA = getAssembler(); - unsigned EFlags = MCA.getELFHeaderEFlags(); - EFlags &= ~ELF::EF_AMDGPU_ARCH; - EFlags |= Arch; - MCA.setELFHeaderEFlags(EFlags); } MCELFStreamer *llvm::createAMDGPUELFStreamer( const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> MAB, - raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter, + std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter, bool RelaxAll) { - return new AMDGPUELFStreamer(T, Context, std::move(MAB), OS, + return new AMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW), std::move(Emitter)); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h index 0cc0a4c5cd5d..41e9063a759e 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h @@ -23,16 +23,9 @@ class MCCodeEmitter; class MCContext; class MCSubtargetInfo; -class AMDGPUELFStreamer : public MCELFStreamer { -public: - AMDGPUELFStreamer(const Triple &T, MCContext &Context, - std::unique_ptr<MCAsmBackend> MAB, raw_pwrite_stream &OS, - std::unique_ptr<MCCodeEmitter> Emitter); -}; - MCELFStreamer *createAMDGPUELFStreamer(const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> MAB, - raw_pwrite_stream &OS, + std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter, bool RelaxAll); } // namespace llvm. diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 521b3b39bba2..cae7a7a6c7e7 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief CodeEmitter interface for R600 and SI codegen. +/// CodeEmitter interface for R600 and SI codegen. // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 1b062064ace1..dcc10a032afe 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief CodeEmitter interface for R600 and SI codegen. +/// CodeEmitter interface for R600 and SI codegen. 
// //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 2b321c04fb30..c579c7d60e16 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This file provides AMDGPU specific target descriptions. +/// This file provides AMDGPU specific target descriptions. // //===----------------------------------------------------------------------===// @@ -22,6 +22,7 @@ #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -37,9 +38,17 @@ using namespace llvm; #define GET_SUBTARGETINFO_MC_DESC #include "AMDGPUGenSubtargetInfo.inc" +#define NoSchedModel NoSchedModelR600 +#define GET_SUBTARGETINFO_MC_DESC +#include "R600GenSubtargetInfo.inc" +#undef NoSchedModelR600 + #define GET_REGINFO_MC_DESC #include "AMDGPUGenRegisterInfo.inc" +#define GET_REGINFO_MC_DESC +#include "R600GenRegisterInfo.inc" + static MCInstrInfo *createAMDGPUMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitAMDGPUMCInstrInfo(X); @@ -48,12 +57,17 @@ static MCInstrInfo *createAMDGPUMCInstrInfo() { static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); - InitAMDGPUMCRegisterInfo(X, 0); + if (TT.getArch() == Triple::r600) + InitR600MCRegisterInfo(X, 0); + else + InitAMDGPUMCRegisterInfo(X, 0); return X; } static MCSubtargetInfo * createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + if (TT.getArch() == Triple::r600) + return createR600MCSubtargetInfoImpl(TT, CPU, FS); return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS); } @@ -62,8 +76,10 @@ static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) { - return T.getArch() == Triple::r600 ? 
new R600InstPrinter(MAI, MII, MRI) : - new AMDGPUInstPrinter(MAI, MII, MRI); + if (T.getArch() == Triple::r600) + return new R600InstPrinter(MAI, MII, MRI); + else + return new AMDGPUInstPrinter(MAI, MII, MRI); } static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S, @@ -76,23 +92,25 @@ static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S, static MCTargetStreamer * createAMDGPUObjectTargetStreamer( MCStreamer &S, const MCSubtargetInfo &STI) { - return new AMDGPUTargetELFStreamer(S); + return new AMDGPUTargetELFStreamer(S, STI); } static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> &&MAB, - raw_pwrite_stream &OS, + std::unique_ptr<MCObjectWriter> &&OW, std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll) { - return createAMDGPUELFStreamer(T, Context, std::move(MAB), OS, + return createAMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW), std::move(Emitter), RelaxAll); } extern "C" void LLVMInitializeAMDGPUTargetMC() { + + TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(getTheAMDGPUTarget(), createR600MCInstrInfo); for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) { RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); - TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); @@ -103,6 +121,8 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { // R600 specific registration TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(), createR600MCCodeEmitter); + TargetRegistry::RegisterObjectTargetStreamer( + getTheAMDGPUTarget(), createAMDGPUObjectTargetStreamer); // GCN specific registration TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(), diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 0b3563303ad0..f3628d96d6e9 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Provides AMDGPU specific target descriptions. +/// Provides AMDGPU specific target descriptions. 
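A note on the AMDGPUMCTargetDesc.cpp hunks above: R600 now gets its own TableGen-generated register, subtarget and instruction info, and the shared MC factory callbacks simply branch on the triple. A simplified, self-contained model of that dispatch pattern follows; the Arch enum and the stub type are placeholders standing in for the real Triple and the generated Init*/create* helpers:

    // Illustration only: one registration entry point, two flavours of the
    // MC objects, selected from the target triple in the same way as
    // createAMDGPUMCRegisterInfo / createAMDGPUMCSubtargetInfo above.
    #include <memory>
    #include <string>

    enum class Arch { R600, AMDGCN };

    struct RegisterInfoStub { std::string Flavour; }; // placeholder type

    static std::unique_ptr<RegisterInfoStub> createMCRegisterInfo(Arch A) {
      auto X = std::make_unique<RegisterInfoStub>();
      // r600 triples pick the R600 tables, everything else picks the GCN ones.
      X->Flavour = (A == Arch::R600) ? "R600" : "AMDGPU";
      return X;
    }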
// //===----------------------------------------------------------------------===// // @@ -25,7 +25,7 @@ class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; -class MCObjectWriter; +class MCObjectTargetWriter; class MCRegisterInfo; class MCSubtargetInfo; class MCTargetOptions; @@ -40,24 +40,30 @@ Target &getTheGCNTarget(); MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); +MCInstrInfo *createR600MCInstrInfo(); MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createAMDGPUAsmBackend(const Target &T, + const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); -std::unique_ptr<MCObjectWriter> +std::unique_ptr<MCObjectTargetWriter> createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend, raw_pwrite_stream &OS); + bool HasRelocationAddend); } // End llvm namespace #define GET_REGINFO_ENUM #include "AMDGPUGenRegisterInfo.inc" #undef GET_REGINFO_ENUM +#define GET_REGINFO_ENUM +#include "R600GenRegisterInfo.inc" +#undef GET_REGINFO_ENUM + #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM #define GET_INSTRINFO_SCHED_ENUM @@ -66,9 +72,20 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, #undef GET_INSTRINFO_OPERAND_ENUM #undef GET_INSTRINFO_ENUM +#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_OPERAND_ENUM +#define GET_INSTRINFO_SCHED_ENUM +#include "R600GenInstrInfo.inc" +#undef GET_INSTRINFO_SCHED_ENUM +#undef GET_INSTRINFO_OPERAND_ENUM +#undef GET_INSTRINFO_ENUM #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" #undef GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_ENUM +#include "R600GenSubtargetInfo.inc" +#undef GET_SUBTARGETINFO_ENUM + #endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index d897956daccf..6a41e3f650bc 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -39,6 +39,84 @@ using namespace llvm::AMDGPU; // AMDGPUTargetStreamer //===----------------------------------------------------------------------===// +static const struct { + const char *Name; + unsigned Mach; +} MachTable[] = { + // Radeon HD 2000/3000 Series (R600). + { "r600", ELF::EF_AMDGPU_MACH_R600_R600 }, + { "r630", ELF::EF_AMDGPU_MACH_R600_R630 }, + { "rs880", ELF::EF_AMDGPU_MACH_R600_RS880 }, + { "rv670", ELF::EF_AMDGPU_MACH_R600_RV670 }, + // Radeon HD 4000 Series (R700). + { "rv710", ELF::EF_AMDGPU_MACH_R600_RV710 }, + { "rv730", ELF::EF_AMDGPU_MACH_R600_RV730 }, + { "rv770", ELF::EF_AMDGPU_MACH_R600_RV770 }, + // Radeon HD 5000 Series (Evergreen). + { "cedar", ELF::EF_AMDGPU_MACH_R600_CEDAR }, + { "cypress", ELF::EF_AMDGPU_MACH_R600_CYPRESS }, + { "juniper", ELF::EF_AMDGPU_MACH_R600_JUNIPER }, + { "redwood", ELF::EF_AMDGPU_MACH_R600_REDWOOD }, + { "sumo", ELF::EF_AMDGPU_MACH_R600_SUMO }, + // Radeon HD 6000 Series (Northern Islands). + { "barts", ELF::EF_AMDGPU_MACH_R600_BARTS }, + { "caicos", ELF::EF_AMDGPU_MACH_R600_CAICOS }, + { "cayman", ELF::EF_AMDGPU_MACH_R600_CAYMAN }, + { "turks", ELF::EF_AMDGPU_MACH_R600_TURKS }, + // AMDGCN GFX6. 
+ { "gfx600", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 }, + { "tahiti", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 }, + { "gfx601", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, + { "hainan", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, + { "oland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, + { "pitcairn", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, + { "verde", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, + // AMDGCN GFX7. + { "gfx700", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 }, + { "kaveri", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 }, + { "gfx701", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 }, + { "hawaii", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 }, + { "gfx702", ELF::EF_AMDGPU_MACH_AMDGCN_GFX702 }, + { "gfx703", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 }, + { "kabini", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 }, + { "mullins", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 }, + { "gfx704", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 }, + { "bonaire", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 }, + // AMDGCN GFX8. + { "gfx801", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 }, + { "carrizo", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 }, + { "gfx802", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 }, + { "iceland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 }, + { "tonga", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 }, + { "gfx803", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, + { "fiji", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, + { "polaris10", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, + { "polaris11", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, + { "gfx810", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 }, + { "stoney", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 }, + // AMDGCN GFX9. + { "gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900 }, + { "gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902 }, + { "gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904 }, + { "gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906 }, + // Not specified processor. + { nullptr, ELF::EF_AMDGPU_MACH_NONE } +}; + +unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const { + auto Entry = MachTable; + for (; Entry->Name && GPU != Entry->Name; ++Entry) + ; + return Entry->Mach; +} + +const char *AMDGPUTargetStreamer::getMachName(unsigned Mach) { + auto Entry = MachTable; + for (; Entry->Name && Mach != Entry->Mach; ++Entry) + ; + return Entry->Name; +} + bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) { HSAMD::Metadata HSAMetadata; if (HSAMD::fromString(HSAMetadataString, HSAMetadata)) @@ -55,9 +133,12 @@ AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) : AMDGPUTargetStreamer(S), OS(OS) { } -void -AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major, - uint32_t Minor) { +void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) { + OS << "\t.amdgcn_target \"" << Target << "\"\n"; +} + +void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion( + uint32_t Major, uint32_t Minor) { OS << "\t.hsa_code_object_version " << Twine(Major) << "," << Twine(Minor) << '\n'; } @@ -118,12 +199,157 @@ bool AMDGPUTargetAsmStreamer::EmitPALMetadata( return true; } +void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( + const MCSubtargetInfo &STI, StringRef KernelName, + const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR, + bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) { + amdhsa::kernel_descriptor_t DefaultKD = getDefaultAmdhsaKernelDescriptor(); + + IsaInfo::IsaVersion IVersion = IsaInfo::getIsaVersion(STI.getFeatureBits()); + + OS << "\t.amdhsa_kernel " << KernelName << '\n'; + +#define PRINT_IF_NOT_DEFAULT(STREAM, DIRECTIVE, KERNEL_DESC, \ + DEFAULT_KERNEL_DESC, MEMBER_NAME, FIELD_NAME) \ + if 
(AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) != \ + AMDHSA_BITS_GET(DEFAULT_KERNEL_DESC.MEMBER_NAME, FIELD_NAME)) \ + STREAM << "\t\t" << DIRECTIVE << " " \ + << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n'; + + if (KD.group_segment_fixed_size != DefaultKD.group_segment_fixed_size) + OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size + << '\n'; + if (KD.private_segment_fixed_size != DefaultKD.private_segment_fixed_size) + OS << "\t\t.amdhsa_private_segment_fixed_size " + << KD.private_segment_fixed_size << '\n'; + + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_queue_ptr", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_id", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_user_sgpr_private_segment_size", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_info", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_vgpr_workitem_id", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID); + + // These directives are required. 
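An aside on the PRINT_IF_NOT_DEFAULT macro defined just above: it compares each bit-field of the kernel descriptor against the same field in the default descriptor and prints the directive only when they differ, which keeps the emitted .amdhsa_kernel block minimal. A self-contained model of that pattern is sketched below; the bit position and field name are illustrative, not the real AMDHSAKernelDescriptor.h layout:

    // Model of "print a directive only when a field differs from the default".
    #include <cstdint>
    #include <iostream>

    struct KernelDescStub { uint32_t compute_pgm_rsrc2 = 0; }; // placeholder

    static uint32_t getBits(uint32_t V, uint32_t Shift, uint32_t Width) {
      return (V >> Shift) & ((1u << Width) - 1); // stand-in for AMDHSA_BITS_GET
    }

    static void printIfNotDefault(std::ostream &OS, const char *Directive,
                                  uint32_t Val, uint32_t Def) {
      if (Val != Def)
        OS << "\t\t" << Directive << " " << Val << '\n';
    }

    int main() {
      KernelDescStub KD, DefaultKD;
      KD.compute_pgm_rsrc2 |= 1u << 7; // pretend the workgroup-id-x enable bit is bit 7
      printIfNotDefault(std::cout, ".amdhsa_system_sgpr_workgroup_id_x",
                        getBits(KD.compute_pgm_rsrc2, 7, 1),
                        getBits(DefaultKD.compute_pgm_rsrc2, 7, 1));
    }

The next_free_vgpr and next_free_sgpr directives that follow are the exception: as the comment above notes, they are required, so they are printed unconditionally.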
+ OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n'; + OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n'; + + if (!ReserveVCC) + OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n'; + if (IVersion.Major >= 7 && !ReserveFlatScr) + OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n'; + if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI)) + OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n'; + + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_32", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_16_64", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_32", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_16_64", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_dx10_clamp", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_ieee_mode", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE); + if (IVersion.Major >= 9) + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_fp16_overflow", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_denorm_src", KD, DefaultKD, compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_ieee_div_zero", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_ieee_overflow", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_ieee_underflow", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_ieee_inexact", KD, DefaultKD, compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_int_div_zero", KD, DefaultKD, compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO); +#undef PRINT_IF_NOT_DEFAULT + + OS << "\t.end_amdhsa_kernel\n"; +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// -AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S) - : AMDGPUTargetStreamer(S), Streamer(S) {} +AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer( + MCStreamer &S, const MCSubtargetInfo &STI) + : AMDGPUTargetStreamer(S), Streamer(S) { + MCAssembler &MCA = getStreamer().getAssembler(); + unsigned EFlags = MCA.getELFHeaderEFlags(); + + EFlags &= ~ELF::EF_AMDGPU_MACH; + EFlags |= getMACH(STI.getCPU()); + + EFlags &= ~ELF::EF_AMDGPU_XNACK; + if (AMDGPU::hasXNACK(STI)) + EFlags |= ELF::EF_AMDGPU_XNACK; + + MCA.setELFHeaderEFlags(EFlags); +} MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { return 
static_cast<MCELFStreamer &>(Streamer); @@ -150,9 +376,10 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote( S.PopSection(); } -void -AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major, - uint32_t Minor) { +void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {} + +void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion( + uint32_t Major, uint32_t Minor) { EmitAMDGPUNote( MCConstantExpr::create(8, getContext()), @@ -207,7 +434,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) { MCSymbolELF *Symbol = cast<MCSymbolELF>( getStreamer().getContext().getOrCreateSymbol(SymbolName)); - Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); + Symbol->setType(Type); } bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { @@ -271,3 +498,46 @@ bool AMDGPUTargetELFStreamer::EmitPALMetadata( ); return true; } + +void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( + const MCSubtargetInfo &STI, StringRef KernelName, + const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + bool ReserveXNACK) { + auto &Streamer = getStreamer(); + auto &Context = Streamer.getContext(); + + MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>( + Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd"))); + KernelDescriptorSymbol->setBinding(ELF::STB_GLOBAL); + KernelDescriptorSymbol->setType(ELF::STT_OBJECT); + KernelDescriptorSymbol->setSize( + MCConstantExpr::create(sizeof(KernelDescriptor), Context)); + + MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>( + Context.getOrCreateSymbol(Twine(KernelName))); + KernelCodeSymbol->setBinding(ELF::STB_LOCAL); + + Streamer.EmitLabel(KernelDescriptorSymbol); + Streamer.EmitBytes(StringRef( + (const char*)&(KernelDescriptor), + offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset))); + // FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The + // expression being created is: + // (start of kernel code) - (start of kernel descriptor) + // It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64. + Streamer.EmitValue(MCBinaryExpr::createSub( + MCSymbolRefExpr::create( + KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context), + MCSymbolRefExpr::create( + KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context), + Context), + sizeof(KernelDescriptor.kernel_code_entry_byte_offset)); + Streamer.EmitBytes(StringRef( + (const char*)&(KernelDescriptor) + + offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) + + sizeof(KernelDescriptor.kernel_code_entry_byte_offset), + sizeof(KernelDescriptor) - + offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) - + sizeof(KernelDescriptor.kernel_code_entry_byte_offset))); +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 0919b754480d..472da1b73593 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -14,6 +14,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" namespace llvm { #include "AMDGPUPTNote.h" @@ -30,9 +31,17 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { protected: MCContext &getContext() const { return Streamer.getContext(); } + /// \returns Equivalent EF_AMDGPU_MACH_* value for given \p GPU name. 
+ unsigned getMACH(StringRef GPU) const; + public: + /// \returns Equivalent GPU name for an EF_AMDGPU_MACH_* value. + static const char *getMachName(unsigned Mach); + AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0; + virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) = 0; @@ -56,12 +65,21 @@ public: /// \returns True on success, false on failure. virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0; + + virtual void EmitAmdhsaKernelDescriptor( + const MCSubtargetInfo &STI, StringRef KernelName, + const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + bool ReserveXNACK) = 0; }; class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { formatted_raw_ostream &OS; public: AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); + + void EmitDirectiveAMDGCNTarget(StringRef Target) override; + void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) override; @@ -81,6 +99,12 @@ public: /// \returns True on success, false on failure. bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override; + + void EmitAmdhsaKernelDescriptor( + const MCSubtargetInfo &STI, StringRef KernelName, + const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + bool ReserveXNACK) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { @@ -90,10 +114,12 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { function_ref<void(MCELFStreamer &)> EmitDesc); public: - AMDGPUTargetELFStreamer(MCStreamer &S); + AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI); MCELFStreamer &getStreamer(); + void EmitDirectiveAMDGCNTarget(StringRef Target) override; + void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) override; @@ -113,6 +139,12 @@ public: /// \returns True on success, false on failure. bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override; + + void EmitAmdhsaKernelDescriptor( + const MCSubtargetInfo &STI, StringRef KernelName, + const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + bool ReserveXNACK) override; }; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt index f9cb4678dc51..2d201bbbd7b8 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt @@ -2,11 +2,11 @@ add_llvm_library(LLVMAMDGPUDesc AMDGPUAsmBackend.cpp AMDGPUELFObjectWriter.cpp AMDGPUELFStreamer.cpp - AMDGPUHSAMetadataStreamer.cpp AMDGPUMCAsmInfo.cpp AMDGPUMCCodeEmitter.cpp AMDGPUMCTargetDesc.cpp AMDGPUTargetStreamer.cpp R600MCCodeEmitter.cpp + R600MCTargetDesc.cpp SIMCCodeEmitter.cpp ) diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index eab90e1d344c..28d4bc1829e2 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -9,13 +9,12 @@ // /// \file /// -/// \brief The R600 code emitter produces machine code that can be executed +/// The R600 code emitter produces machine code that can be executed /// directly on the GPU device. 
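An aside, referring back to the MachTable added to AMDGPUTargetStreamer.cpp above: getMACH and getMachName are both sentinel-terminated linear scans, so a GPU name that is not in the table falls through to the { nullptr, EF_AMDGPU_MACH_NONE } entry, and an unknown e_flags value falls through to nullptr. A small self-contained model of that lookup, with placeholder values instead of the real ELF constants:

    // Model of the sentinel-terminated scan used by getMACH/getMachName.
    #include <cstring>

    struct MachEntry { const char *Name; unsigned Mach; };

    // Illustrative two-entry subset; the final entry is the sentinel.
    static const MachEntry Table[] = {
      { "gfx900", 0x2c }, // placeholder, not necessarily the real EF_AMDGPU_MACH_* value
      { "r600",   0x01 }, // placeholder
      { nullptr,  0x00 }  // stands in for EF_AMDGPU_MACH_NONE
    };

    static unsigned getMach(const char *GPU) {
      const MachEntry *Entry = Table;
      // Stop at a matching name or at the sentinel; either way Entry->Mach
      // is the answer (the NONE value for unknown GPUs).
      for (; Entry->Name && std::strcmp(GPU, Entry->Name) != 0; ++Entry)
        ;
      return Entry->Mach;
    }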
// //===----------------------------------------------------------------------===// #include "MCTargetDesc/AMDGPUFixupKinds.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "R600Defines.h" #include "llvm/MC/MCCodeEmitter.h" @@ -36,30 +35,40 @@ using namespace llvm; namespace { -class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { +class R600MCCodeEmitter : public MCCodeEmitter { const MCRegisterInfo &MRI; + const MCInstrInfo &MCII; public: R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) - : AMDGPUMCCodeEmitter(mcii), MRI(mri) {} + : MRI(mri), MCII(mcii) {} R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete; - /// \brief Encode the instruction and write it to the OS. + /// Encode the instruction and write it to the OS. void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + const MCSubtargetInfo &STI) const; /// \returns the encoding for an MCOperand. uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + const MCSubtargetInfo &STI) const; private: + void Emit(uint32_t value, raw_ostream &OS) const; void Emit(uint64_t value, raw_ostream &OS) const; unsigned getHWReg(unsigned regNo) const; + + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; + void verifyInstructionPredicates(const MCInst &MI, + uint64_t AvailableFeatures) const; + }; } // end anonymous namespace @@ -94,16 +103,16 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, computeAvailableFeatures(STI.getFeatureBits())); const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::FETCH_CLAUSE || - MI.getOpcode() == AMDGPU::ALU_CLAUSE || - MI.getOpcode() == AMDGPU::BUNDLE || - MI.getOpcode() == AMDGPU::KILL) { + if (MI.getOpcode() == R600::RETURN || + MI.getOpcode() == R600::FETCH_CLAUSE || + MI.getOpcode() == R600::ALU_CLAUSE || + MI.getOpcode() == R600::BUNDLE || + MI.getOpcode() == R600::KILL) { return; } else if (IS_VTX(Desc)) { uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset - if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) { + if (!(STI.getFeatureBits()[R600::FeatureCaymanISA])) { InstWord2 |= 1 << 19; // Mega-Fetch bit } @@ -136,7 +145,7 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, Emit((uint32_t) 0, OS); } else { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); - if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) && + if ((STI.getFeatureBits()[R600::FeatureR600ALUInst]) && ((Desc.TSFlags & R600_InstFlag::OP1) || Desc.TSFlags & R600_InstFlag::OP2)) { uint64_t ISAOpCode = Inst & (0x3FFULL << 39); @@ -148,11 +157,11 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, } void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { - support::endian::Writer<support::little>(OS).write(Value); + support::endian::write(OS, Value, support::little); } void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { - support::endian::Writer<support::little>(OS).write(Value); + support::endian::write(OS, Value, support::little); 
} unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { @@ -186,4 +195,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, } #define ENABLE_INSTR_PREDICATE_VERIFIER -#include "AMDGPUGenMCCodeEmitter.inc" +#include "R600GenMCCodeEmitter.inc" diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp new file mode 100644 index 000000000000..1c99a708e5ac --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp @@ -0,0 +1,27 @@ +//===-- R600MCTargetDesc.cpp - R600 Target Descriptions -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This file provides R600 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCInstrInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "R600GenInstrInfo.inc" + +MCInstrInfo *llvm::createR600MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitR600MCInstrInfo(X); + return X; +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 94c0157edeb5..36913bd04274 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief The SI code emitter produces machine code that can be executed +/// The SI code emitter produces machine code that can be executed /// directly on the GPU device. // //===----------------------------------------------------------------------===// @@ -43,7 +43,7 @@ namespace { class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { const MCRegisterInfo &MRI; - /// \brief Encode an fp or int literal + /// Encode an fp or int literal uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo, const MCSubtargetInfo &STI) const; @@ -54,7 +54,7 @@ public: SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete; - /// \brief Encode the instruction and write it to the OS. + /// Encode the instruction and write it to the OS. void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; @@ -64,7 +64,7 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - /// \brief Use a fixup to encode the simm16 field for SOPP branch + /// Use a fixup to encode the simm16 field for SOPP branch /// instructions. 
unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, @@ -335,13 +335,24 @@ SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, const MCOperand &MO = MI.getOperand(OpNo); - unsigned Reg = MO.getReg(); - RegEnc |= MRI.getEncodingValue(Reg); - RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; - if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { - RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; + if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { + RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; + } + return RegEnc; + } else { + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI); + if (Enc != ~0U && Enc != 255) { + return Enc | SDWA9EncValues::SRC_SGPR_MASK; + } } - return RegEnc; + + llvm_unreachable("Unsupported operand kind"); + return 0; } unsigned @@ -427,3 +438,6 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, llvm_unreachable("Encoding of this operand type is not supported yet."); return 0; } + +#define ENABLE_INSTR_PREDICATE_VERIFIER +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 30a2df510386..1e0bc62c45a6 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -7,9 +7,63 @@ // //===----------------------------------------------------------------------===// -class MIMG_Mask <string op, int channels> { - string Op = op; - int Channels = channels; +// MIMG-specific encoding families to distinguish between semantically +// equivalent machine instructions with different encoding. +// +// - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8) +// - MIMGEncGfx8: encoding introduced with gfx8 for atomics +class MIMGEncoding; + +def MIMGEncGfx6 : MIMGEncoding; +def MIMGEncGfx8 : MIMGEncoding; + +def MIMGEncoding : GenericEnum { + let FilterClass = "MIMGEncoding"; +} + +// Represent an ISA-level opcode, independent of the encoding and the +// vdata/vaddr size. 
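A forward-looking aside on the GenericEnum, GenericTable and SearchIndex definitions introduced in this file (MIMGBaseOpcodesTable, MIMGDimInfoTable, MIMGInfoTable and getMIMGInfo below): TableGen's searchable-tables backend turns each of them into a constant array plus a lookup function named by PrimaryKeyName. A rough, self-contained model of the kind of code that results is sketched here; the struct layout and row values are invented purely for illustration:

    // Rough model of a generated searchable table: a sorted constant array
    // keyed by the primary key, plus a binary-search accessor.
    #include <algorithm>
    #include <cstdint>
    #include <iterator>

    struct MIMGInfoStub {            // invented layout, for illustration only
      uint16_t Opcode;               // primary key of the getMIMGInfo index
      uint8_t  BaseOpcode;
      uint8_t  VDataDwords;
      uint8_t  VAddrDwords;
    };

    static const MIMGInfoStub MIMGInfoRows[] = {  // kept sorted by Opcode
      { 100, 0, 1, 1 },
      { 101, 0, 1, 2 },
      { 102, 1, 2, 1 },
    };

    static const MIMGInfoStub *getMIMGInfoStub(uint16_t Opcode) {
      auto It = std::lower_bound(std::begin(MIMGInfoRows), std::end(MIMGInfoRows),
                                 Opcode, [](const MIMGInfoStub &E, uint16_t Op) {
                                   return E.Opcode < Op;
                                 });
      return (It != std::end(MIMGInfoRows) && It->Opcode == Opcode) ? &*It : nullptr;
    }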
+class MIMGBaseOpcode { + MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(NAME); + bit Store = 0; + bit Atomic = 0; + bit AtomicX2 = 0; // (f)cmpswap + bit Sampler = 0; + bits<8> NumExtraArgs = 0; + bit Gradients = 0; + bit Coordinates = 1; + bit LodOrClampOrMip = 0; + bit HasD16 = 0; +} + +def MIMGBaseOpcode : GenericEnum { + let FilterClass = "MIMGBaseOpcode"; +} + +def MIMGBaseOpcodesTable : GenericTable { + let FilterClass = "MIMGBaseOpcode"; + let CppTypeName = "MIMGBaseOpcodeInfo"; + let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", + "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip", + "HasD16"]; + GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; + + let PrimaryKey = ["BaseOpcode"]; + let PrimaryKeyName = "getMIMGBaseOpcodeInfo"; +} + +def MIMGDim : GenericEnum { + let FilterClass = "AMDGPUDimProps"; +} + +def MIMGDimInfoTable : GenericTable { + let FilterClass = "AMDGPUDimProps"; + let CppTypeName = "MIMGDimInfo"; + let Fields = ["Dim", "NumCoords", "NumGradients", "DA"]; + GenericEnum TypeOf_Dim = MIMGDim; + + let PrimaryKey = ["Dim"]; + let PrimaryKeyName = "getMIMGDimInfo"; } class mimg <bits<7> si, bits<7> vi = si> { @@ -17,254 +71,372 @@ class mimg <bits<7> si, bits<7> vi = si> { field bits<7> VI = vi; } -class MIMG_Helper <dag outs, dag ins, string asm, - string dns=""> : MIMG<outs, ins, asm,[]> { +class MIMG <dag outs, string dns = ""> + : InstSI <outs, (ins), "", []> { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MIMG = 1; + let Uses = [EXEC]; let mayLoad = 1; let mayStore = 0; let hasPostISelHook = 1; + let SchedRW = [WriteVMEM]; + let UseNamedOperandTable = 1; + let hasSideEffects = 0; // XXX ???? + + let SubtargetPredicate = isGCN; let DecoderNamespace = dns; let isAsmParserOnly = !if(!eq(dns,""), 1, 0); let AsmMatchConverter = "cvtMIMG"; let usesCustomInserter = 1; - let SchedRW = [WriteVMEM]; + + Instruction Opcode = !cast<Instruction>(NAME); + MIMGBaseOpcode BaseOpcode; + MIMGEncoding MIMGEncoding = MIMGEncGfx6; + bits<8> VDataDwords; + bits<8> VAddrDwords; +} + +def MIMGInfoTable : GenericTable { + let FilterClass = "MIMG"; + let CppTypeName = "MIMGInfo"; + let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"]; + GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; + GenericEnum TypeOf_MIMGEncoding = MIMGEncoding; + + let PrimaryKey = ["BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"]; + let PrimaryKeyName = "getMIMGOpcodeHelper"; +} + +def getMIMGInfo : SearchIndex { + let Table = MIMGInfoTable; + let Key = ["Opcode"]; } class MIMG_NoSampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, RegisterClass addr_rc, - string dns=""> : MIMG_Helper < - (outs dst_rc:$vdata), - (ins addr_rc:$vaddr, SReg_256:$srsrc, - dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", - dns>, MIMGe<op> { + string dns=""> + : MIMG <(outs dst_rc:$vdata), dns>, + MIMGe<op> { let ssamp = 0; + let d16 = !if(BaseOpcode.HasD16, ?, 0); + + let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); } multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - int channels> { - def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, 
VGPR_32, - !if(!eq(channels, 1), "AMDGPU", "")>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>, - MIMG_Mask<asm#"_V4", channels>; -} + RegisterClass dst_rc, + bit enableDisasm> { + let VAddrDwords = 1 in + def NAME # _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + let VAddrDwords = 2 in + def NAME # _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>; + let VAddrDwords = 3 in + def NAME # _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>; + let VAddrDwords = 4 in + def NAME # _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>; +} + +multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0, + bit isResInfo = 0> { + def "" : MIMGBaseOpcode { + let Coordinates = !if(isResInfo, 0, 1); + let LodOrClampOrMip = mip; + let HasD16 = has_d16; + } -multiclass MIMG_NoSampler <bits<7> op, string asm> { - defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; - defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>; - defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>; - defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>; + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), + mayLoad = !if(isResInfo, 0, 1) in { + let VDataDwords = 1 in + defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; + let VDataDwords = 2 in + defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0>; + let VDataDwords = 3 in + defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>; + let VDataDwords = 4 in + defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>; + } } class MIMG_Store_Helper <bits<7> op, string asm, RegisterClass data_rc, RegisterClass addr_rc, - string dns = ""> : MIMG_Helper < - (outs), - (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, - dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", dns>, MIMGe<op> { + string dns = ""> + : MIMG <(outs), dns>, + MIMGe<op> { let ssamp = 0; - let mayLoad = 1; // TableGen requires this for matching with the intrinsics + let d16 = !if(BaseOpcode.HasD16, ?, 0); + + let mayLoad = 0; let mayStore = 1; - let hasSideEffects = 1; + let hasSideEffects = 0; let hasPostISelHook = 0; let DisableWQM = 1; + + let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); } multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm, RegisterClass data_rc, - int channels> { - def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, - !if(!eq(channels, 1), "AMDGPU", "")>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>, - MIMG_Mask<asm#"_V4", channels>; -} + bit enableDisasm> { + let VAddrDwords = 1 in + def NAME # _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + let VAddrDwords = 2 in + def NAME # _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>; + let VAddrDwords = 3 in + def NAME # _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>; + let VAddrDwords = 4 in + def 
NAME # _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>; +} + +multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> { + def "" : MIMGBaseOpcode { + let Store = 1; + let LodOrClampOrMip = mip; + let HasD16 = has_d16; + } -multiclass MIMG_Store <bits<7> op, string asm> { - defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>; - defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>; - defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>; - defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>; + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { + let VDataDwords = 1 in + defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>; + let VDataDwords = 2 in + defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 0>; + let VDataDwords = 3 in + defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 0>; + let VDataDwords = 4 in + defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 0>; + } } class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, - RegisterClass addr_rc> : MIMG_Helper < - (outs data_rc:$vdst), - (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, - dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" - > { + RegisterClass addr_rc, string dns="", + bit enableDasm = 0> + : MIMG <(outs data_rc:$vdst), !if(enableDasm, dns, "")> { + let mayLoad = 1; let mayStore = 1; - let hasSideEffects = 1; + let hasSideEffects = 1; // FIXME: Remove this let hasPostISelHook = 0; let DisableWQM = 1; let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; -} -class MIMG_Atomic_Real_si<mimg op, string name, string asm, - RegisterClass data_rc, RegisterClass addr_rc> : - MIMG_Atomic_Helper<asm, data_rc, addr_rc>, - SIMCInstr<name, SIEncodingFamily.SI>, - MIMGe<op.SI> { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class MIMG_Atomic_Real_vi<mimg op, string name, string asm, - RegisterClass data_rc, RegisterClass addr_rc> : - MIMG_Atomic_Helper<asm, data_rc, addr_rc>, - SIMCInstr<name, SIEncodingFamily.VI>, - MIMGe<op.VI> { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm, - RegisterClass data_rc, RegisterClass addr_rc> { - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>, - SIMCInstr<name, SIEncodingFamily.NONE>; + let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da); + let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"; +} + +multiclass MIMG_Atomic_Helper_m <mimg op, string asm, RegisterClass data_rc, + RegisterClass addr_rc, bit enableDasm = 0> { + let ssamp = 0, d16 = 0 in { + def _si : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "SICI", enableDasm>, + SIMCInstr<NAME, SIEncodingFamily.SI>, + MIMGe<op.SI> { + let AssemblerPredicates = [isSICI]; + let DisableDecoder = DisableSIDecoder; + } + + def _vi : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "VI", enableDasm>, + SIMCInstr<NAME, SIEncodingFamily.VI>, + MIMGe<op.VI> { + let AssemblerPredicates = [isVI]; + let DisableDecoder = DisableVIDecoder; + let MIMGEncoding = MIMGEncGfx8; + } } +} - let ssamp = 0 in { - def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, 
addr_rc>; +multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm, + RegisterClass data_rc, + bit enableDasm = 0> { + // _V* variants have different address size, but the size is not encoded. + // So only one variant can be disassembled. V1 looks the safest to decode. + let VAddrDwords = 1 in + defm _V1 : MIMG_Atomic_Helper_m <op, asm, data_rc, VGPR_32, enableDasm>; + let VAddrDwords = 2 in + defm _V2 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_64>; + let VAddrDwords = 3 in + defm _V3 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_96>; + let VAddrDwords = 4 in + defm _V4 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_128>; +} + +multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics + def "" : MIMGBaseOpcode { + let Atomic = 1; + let AtomicX2 = isCmpSwap; + } - def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>; + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { + // _V* variants have different dst size, but the size is encoded implicitly, + // using dmask and tfe. Only 32-bit variant is registered with disassembler. + // Other variants are reconstructed by disassembler using dmask and tfe. + let VDataDwords = !if(isCmpSwap, 2, 1) in + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1>; + let VDataDwords = !if(isCmpSwap, 4, 2) in + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64)>; } } -multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> { - defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>; - defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>; - defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>; +class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, + RegisterClass src_rc, string dns=""> + : MIMG <(outs dst_rc:$vdata), dns>, + MIMGe<op> { + let d16 = !if(BaseOpcode.HasD16, ?, 0); + + let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMGAddrSize<int dw, bit enable_disasm> { + int NumWords = dw; + + RegisterClass RegClass = !if(!le(NumWords, 0), ?, + !if(!eq(NumWords, 1), VGPR_32, + !if(!eq(NumWords, 2), VReg_64, + !if(!eq(NumWords, 3), VReg_96, + !if(!eq(NumWords, 4), VReg_128, + !if(!le(NumWords, 8), VReg_256, + !if(!le(NumWords, 16), VReg_512, ?))))))); + + // Whether the instruction variant with this vaddr size should be enabled for + // the auto-generated disassembler. + bit Disassemble = enable_disasm; +} + +// Return whether a value inside the range [min, max] (endpoints inclusive) +// is in the given list. +class isRangeInList<int min, int max, list<int> lst> { + bit ret = !foldl(0, lst, lhs, y, !or(lhs, !and(!le(min, y), !le(y, max)))); +} + +class MIMGAddrSizes_tmp<list<MIMGAddrSize> lst, int min> { + list<MIMGAddrSize> List = lst; + int Min = min; +} + +class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> { + // List of all possible numbers of address words, taking all combinations of + // A16 and image dimension into account (note: no MSAA, since this is for + // sample/gather ops). 
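A worked example for the address-size computation that follows, read off the lists and the !add below (with the usual sample modifiers assumed for the extra arguments): a plain sample with no gradients and no lod/clamp starts from 1, 2 or 3 address dwords for the coordinates alone; adding lod or clamp extends that to 1 through 4; the gradient forms start higher because the derivatives occupy additional dwords. Each count is then increased by the number of extra address arguments the AMDGPUSampleVariant carries (such as an offset, bias or depth-compare value), and the fold over [1, 2, 3, 4, 8, 16] rounds every resulting count up to a vector register class that actually exists, which is how the _V1 through _V16 machine-instruction variants come about.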
+ list<int> AllNumAddrWords = + !foreach(dw, !if(sample.Gradients, + !if(!eq(sample.LodOrClamp, ""), + [2, 3, 4, 5, 6, 7, 9], + [2, 3, 4, 5, 7, 8, 10]), + !if(!eq(sample.LodOrClamp, ""), + [1, 2, 3], + [1, 2, 3, 4])), + !add(dw, !size(sample.ExtraAddrArgs))); + + // Generate machine instructions based on possible register classes for the + // required numbers of address words. The disassembler defaults to the + // smallest register class. + list<MIMGAddrSize> MachineInstrs = + !foldl(MIMGAddrSizes_tmp<[], 0>, [1, 2, 3, 4, 8, 16], lhs, dw, + !if(isRangeInList<lhs.Min, dw, AllNumAddrWords>.ret, + MIMGAddrSizes_tmp< + !listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]), + !if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords + lhs)).List; } -class MIMG_Sampler_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, - bit wqm, - string dns=""> : MIMG_Helper < - (outs dst_rc:$vdata), - (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, - dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", - dns>, MIMGe<op> { - let WQM = wqm; +multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, + AMDGPUSampleVariant sample, RegisterClass dst_rc, + bit enableDisasm = 0> { + foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in { + let VAddrDwords = addr.NumWords in + def _V # addr.NumWords + : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass, + !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + } } -multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - int channels, bit wqm> { - def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm, - !if(!eq(channels, 1), "AMDGPU", "")>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>, - MIMG_Mask<asm#"_V4", channels>; - def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>, - MIMG_Mask<asm#"_V8", channels>; - def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>, - MIMG_Mask<asm#"_V16", channels>; -} - -multiclass MIMG_Sampler <bits<7> op, string asm, bit wqm=0> { - defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>; - defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>; - defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>; - defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>; -} - -multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>; - -class MIMG_Gather_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, bit wqm> : MIMG < - (outs dst_rc:$vdata), - (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, - dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", - []>, MIMGe<op> { - let mayLoad = 1; - let mayStore = 0; +class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample> + : MIMGBaseOpcode { + let Sampler = 1; + let NumExtraArgs = !size(sample.ExtraAddrArgs); + let Gradients = sample.Gradients; + let LodOrClampOrMip = !ne(sample.LodOrClamp, ""); +} - // DMASK was repurposed for GATHER4. 4 components are always - // returned and DMASK works like a swizzle - it selects - // the component to fetch. The only useful DMASK values are - // 1=red, 2=green, 4=blue, 8=alpha. 
(e.g. 1 returns - // (red,red,red,red) etc.) The ISA document doesn't mention - // this. - // Therefore, disable all code which updates DMASK by setting this: - let Gather4 = 1; - let hasPostISelHook = 0; - let WQM = wqm; +multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, + bit isGetLod = 0, + string asm = "image_sample"#sample.LowerCaseMod> { + def "" : MIMG_Sampler_BaseOpcode<sample> { + let HasD16 = !if(isGetLod, 0, 1); + } - let isAsmParserOnly = 1; // TBD: fix it later + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, + mayLoad = !if(isGetLod, 0, 1) in { + let VDataDwords = 1 in + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1>; + let VDataDwords = 2 in + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; + let VDataDwords = 3 in + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>; + let VDataDwords = 4 in + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>; + } } -multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - int channels, bit wqm> { - def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>, - MIMG_Mask<asm#"_V4", channels>; - def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>, - MIMG_Mask<asm#"_V8", channels>; - def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>, - MIMG_Mask<asm#"_V16", channels>; -} +multiclass MIMG_Sampler_WQM <bits<7> op, AMDGPUSampleVariant sample> + : MIMG_Sampler<op, sample, 1>; -multiclass MIMG_Gather <bits<7> op, string asm, bit wqm=0> { - defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>; - defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>; - defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>; - defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>; +multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, + string asm = "image_gather4"#sample.LowerCaseMod> { + def "" : MIMG_Sampler_BaseOpcode<sample> { + let HasD16 = 1; + } + + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, + Gather4 = 1, hasPostISelHook = 0 in { + let VDataDwords = 2 in + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */ + let VDataDwords = 4 in + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>; + } } -multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>; +multiclass MIMG_Gather_WQM <bits<7> op, AMDGPUSampleVariant sample> + : MIMG_Gather<op, sample, 1>; //===----------------------------------------------------------------------===// // MIMG Instructions //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGCN in { -defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; -//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; -//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; -//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; -//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; -defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">; -defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">; -//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 
0x0000000a>; -//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; - -let mayLoad = 0, mayStore = 0 in { -defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; -} +defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load", 1>; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip", 1, 1>; +defm IMAGE_LOAD_PCK : MIMG_NoSampler <0x00000002, "image_load_pck", 0>; +defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <0x00000003, "image_load_pck_sgn", 0>; +defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <0x00000004, "image_load_mip_pck", 0, 1>; +defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <0x00000005, "image_load_mip_pck_sgn", 0, 1>; +defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store", 1>; +defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip", 1, 1>; +defm IMAGE_STORE_PCK : MIMG_Store <0x0000000a, "image_store_pck", 0>; +defm IMAGE_STORE_MIP_PCK : MIMG_Store <0x0000000b, "image_store_mip_pck", 0, 1>; + +defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo", 0, 1, 1>; defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">; -defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>; +defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", 1>; defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">; defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">; //def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI @@ -277,397 +449,101 @@ defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">; defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">; defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">; defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">; -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, 
"image_sample_cl_o">; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; - -let mayLoad = 0, mayStore = 0 in { -defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; -} - -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; -defm IMAGE_SAMPLE_C_CD_O : 
MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, 
AMDGPUSample_l_o>; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; + +defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">; + +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; + +/********** ========================================= **********/ +/********** Table of dimension-aware image intrinsics **********/ +/********** ========================================= **********/ + +class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { + Intrinsic Intr = I; + MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod)); + AMDGPUDimProps Dim = I.P.Dim; } -/********** ======================= **********/ -/********** Image sampling patterns **********/ -/********** ======================= **********/ - -// Image + sampler -class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode $addr, $rsrc, $sampler, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) ->; - -multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>; -} - -// Image + sampler for amdgcn -// TODO: -// 1. Handle half data type like v4f16, and add D16 bit support; -// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). -// 3. Add A16 support when we pass address of half type. 
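
The ImageDimIntrinsicInfo records introduced above associate each dimension-aware image intrinsic with a MIMG base opcode and a dimension; a GenericTable defined further down in this diff (ImageDimIntrinsicTable, keyed on Intr with PrimaryKeyName "getImageDimIntrinsicInfo") turns those records into a lookup the backend can query. As a rough, non-authoritative illustration of that kind of primary-keyed table, here is a stand-alone C++ sketch; the struct, field, and function names are invented for the sketch and are not the code TableGen actually generates:

// Illustrative only: a hand-rolled analogue of a primary-keyed record table.
#include <algorithm>
#include <iterator>

struct ImageDimIntrinsicInfoSketch {
  unsigned IntrinsicID;   // primary key, kept sorted
  unsigned BaseOpcode;    // stands in for a MIMGBaseOpcode enum value
  unsigned Dim;           // stands in for a MIMGDim enum value
};

// Entries are made up; a generated table would be sorted by the primary key.
static const ImageDimIntrinsicInfoSketch Table[] = {
  {100, 7, 1}, {101, 7, 2}, {205, 12, 1},
};

const ImageDimIntrinsicInfoSketch *lookupByIntrinsic(unsigned ID) {
  auto It = std::lower_bound(std::begin(Table), std::end(Table), ID,
      [](const ImageDimIntrinsicInfoSketch &R, unsigned Key) {
        return R.IntrinsicID < Key;
      });
  // PrimaryKeyEarlyOut-style behaviour: return nothing when the key is absent.
  if (It == std::end(Table) || It->IntrinsicID != ID)
    return nullptr;
  return It;
}
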
-multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { - def : GCNPat< - (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc, - i1:$slc, i1:$lwe, i1:$da)), - (opcode $addr, $rsrc, $sampler, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - 0, 0, (as_i1imm $lwe), (as_i1imm $da)) - >; -} - -multiclass AMDGCNSampleDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { - defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V1), dt, f32>; - defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V2), dt, v2f32>; - defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V4), dt, v4f32>; - defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V8), dt, v8f32>; - defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V16), dt, v16f32>; -} - -// TODO: support v3f32. -multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> { - defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V1), f32>; - defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V2), v2f32>; - defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; -} - -// Image only -class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, - imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), - (opcode $addr, $rsrc, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) ->; - -multiclass ImagePatterns<SDPatternOperator name, string opcode> { - def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; - def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; - def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; -} - -multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { - def : GCNPat < - (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, - i1:$da)), - (opcode $addr, $rsrc, - (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), - 0, 0, (as_i1imm $lwe), (as_i1imm $da)) - >; -} - -multiclass ImageLoadDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { - defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V1), dt, i32>; - defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>; - defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>; -} - -// TODO: support v3f32. -multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> { - defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f32>; - defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2f32>; - defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; -} - -multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { - def : GCNPat < - (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, - i1:$lwe, i1:$da), - (opcode $data, $addr, $rsrc, - (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), - 0, 0, (as_i1imm $lwe), (as_i1imm $da)) - >; -} - -multiclass ImageStoreDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { - defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V1), dt, i32>; - defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>; - defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>; -} - -// TODO: support v3f32. 
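
The removed data-pattern helpers above choose a _V1/_V2/_V4 instruction variant from the result type, which is really a statement about how many dwords of image data the MIMG instruction moves; the new multiclasses express the same thing through VDataDwords. For ordinary loads and samples that count follows the number of channels enabled in DMASK, while gather4 is the exception described in the removed comment near the top of this hunk (it always returns four values), and packed D16 halves the dword count again. A small hedged sketch of that accounting, not the backend's actual selection logic:

#include <bitset>
#include <cstdint>

// Roughly how many VGPR dwords an image load/sample result occupies.
unsigned vdataDwords(uint8_t dmask, bool isGather4, bool packedD16) {
  // gather4 fetches one component from each of four texels, so it always
  // produces four values regardless of dmask.
  unsigned Channels = isGather4
                          ? 4u
                          : unsigned(std::bitset<4>(dmask & 0xf).count());
  if (packedD16)                        // two 16-bit components per dword
    Channels = (Channels + 1) / 2;
  return Channels;                      // TFE/LWE extra dword ignored here
}
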
-multiclass ImageStorePatterns<SDPatternOperator name, string opcode> { - defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f32>; - defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v2f32>; - defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; -} - -class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat < - (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), - (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) ->; - -multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> { - def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>; - def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>; - def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>; -} - -class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : GCNPat < - (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, - imm:$r128, imm:$da, imm:$slc), - (EXTRACT_SUBREG - (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1), - $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)), - sub0) ->; - -// ======= amdgcn Image Intrinsics ============== - -// Image load -defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">; -defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">; -defm : ImageLoadPatterns<int_amdgcn_image_getresinfo, "IMAGE_GET_RESINFO">; - -// Image store -defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">; -defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">; - -// Basic sample -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample, "IMAGE_SAMPLE">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl, "IMAGE_SAMPLE_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d, "IMAGE_SAMPLE_D">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l, "IMAGE_SAMPLE_L">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b, "IMAGE_SAMPLE_B">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz, "IMAGE_SAMPLE_LZ">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd, "IMAGE_SAMPLE_CD">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">; - -// Sample with comparison -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c, "IMAGE_SAMPLE_C">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d, "IMAGE_SAMPLE_C_D">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l, "IMAGE_SAMPLE_C_L">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b, "IMAGE_SAMPLE_C_B">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">; - -// Sample with offsets -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_o, "IMAGE_SAMPLE_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_o, "IMAGE_SAMPLE_D_O">; -defm : 
AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l_o, "IMAGE_SAMPLE_L_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_o, "IMAGE_SAMPLE_B_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">; - -// Sample with comparison and offsets -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_o, "IMAGE_SAMPLE_C_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">; - -// Gather opcodes -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4, "IMAGE_GATHER4">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl, "IMAGE_GATHER4_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l, "IMAGE_GATHER4_L">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b, "IMAGE_GATHER4_B">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl, "IMAGE_GATHER4_B_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz, "IMAGE_GATHER4_LZ">; - -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c, "IMAGE_GATHER4_C">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl, "IMAGE_GATHER4_C_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l, "IMAGE_GATHER4_C_L">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b, "IMAGE_GATHER4_C_B">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl, "IMAGE_GATHER4_C_B_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz, "IMAGE_GATHER4_C_LZ">; - -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_o, "IMAGE_GATHER4_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl_o, "IMAGE_GATHER4_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l_o, "IMAGE_GATHER4_L_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_o, "IMAGE_GATHER4_B_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl_o, "IMAGE_GATHER4_B_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz_o, "IMAGE_GATHER4_LZ_O">; - -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_o, "IMAGE_GATHER4_C_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl_o, "IMAGE_GATHER4_C_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l_o, "IMAGE_GATHER4_C_L_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_o, "IMAGE_GATHER4_C_B_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl_o, "IMAGE_GATHER4_C_B_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz_o, "IMAGE_GATHER4_C_LZ_O">; - -defm : AMDGCNSamplePatterns<int_amdgcn_image_getlod, "IMAGE_GET_LOD">; - -// Image atomics 
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">; -def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>; -def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>; -def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">; - -/* SIsample for simple 1D texture lookup */ -def : GCNPat < - (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm), - (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) ->; - -class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) ->; - -class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT), - (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0) ->; - -class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) ->; - -class SampleShadowPattern<SDNode name, MIMG opcode, - ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) ->; - -class SampleShadowArrayPattern<SDNode name, MIMG opcode, - ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) ->; - -/* SIsample* for texture lookups consuming more address parameters */ -multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l, - MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b, -MIMG sample_d, MIMG sample_c_d, ValueType addr_type> { - def : SamplePattern <SIsample, sample, addr_type>; - def : SampleRectPattern <SIsample, sample, addr_type>; - def : SampleArrayPattern <SIsample, sample, addr_type>; - def : SampleShadowPattern <SIsample, sample_c, addr_type>; - def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>; - - def : SamplePattern <SIsamplel, sample_l, addr_type>; - def : SampleArrayPattern <SIsamplel, sample_l, addr_type>; - def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>; - def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>; - - def : SamplePattern <SIsampleb, sample_b, addr_type>; - def : SampleArrayPattern <SIsampleb, sample_b, addr_type>; - def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>; - def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>; - - def : SamplePattern <SIsampled, sample_d, addr_type>; - def : SampleArrayPattern <SIsampled, sample_d, addr_type>; - def : SampleShadowPattern 
<SIsampled, sample_c_d, addr_type>; - def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>; -} - -defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2, - IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2, - IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2, - IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2, - v2i32>; -defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4, - IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4, - IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4, - IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4, - v4i32>; -defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8, - IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8, - IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8, - IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8, - v8i32>; -defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16, - IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16, - IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16, - IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16, - v16i32>; +def ImageDimIntrinsicTable : GenericTable { + let FilterClass = "ImageDimIntrinsicInfo"; + let Fields = ["Intr", "BaseOpcode", "Dim"]; + GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; + GenericEnum TypeOf_Dim = MIMGDim; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "getImageDimIntrinsicInfo"; + let PrimaryKeyEarlyOut = 1; +} + +foreach intr = !listconcat(AMDGPUImageDimIntrinsics, + AMDGPUImageDimAtomicIntrinsics) in { + def : ImageDimIntrinsicInfo<intr>; +} diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td deleted file mode 100644 index d50dae78e247..000000000000 --- a/lib/Target/AMDGPU/Processors.td +++ /dev/null @@ -1,12 +0,0 @@ -//===-- Processors.td - AMDGPU Processor definitions ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -FIXME: Deleting this file broke buildbots that don't do full rebuilds. This -file is no longer used by the backend, so it can be deleted once all -the buildbots update there dependencies. diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td new file mode 100644 index 000000000000..5c9c1c1ed504 --- /dev/null +++ b/lib/Target/AMDGPU/R600.td @@ -0,0 +1,54 @@ +//===-- R600.td - R600 Tablegen files ----------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +def R600InstrInfo : InstrInfo { + let guessInstructionProperties = 1; + let noNamedPositionallyEncodedOperands = 1; +} + +def R600 : Target { + let InstructionSet = R600InstrInfo; + let AllowRegisterRenaming = 1; +} + +let Namespace = "R600" in { + +foreach Index = 0-15 in { + def sub#Index : SubRegIndex<32, !shl(Index, 5)>; +} + +include "R600RegisterInfo.td" + +} + +def NullALU : InstrItinClass; +def ALU_NULL : FuncUnit; + +include "AMDGPUFeatures.td" +include "R600Schedule.td" +include "R600Processors.td" +include "AMDGPUInstrInfo.td" +include "AMDGPUInstructions.td" +include "R600Instructions.td" +include "R700Instructions.td" +include "EvergreenInstructions.td" +include "CaymanInstructions.td" + +// Calling convention for R600 +def CC_R600 : CallingConv<[ + CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[ + T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW, + T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW, + T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW, + T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW, + T30_XYZW, T31_XYZW, T32_XYZW + ]>>> +]>; diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp new file mode 100644 index 000000000000..68f8c30775b8 --- /dev/null +++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp @@ -0,0 +1,133 @@ +//===-- R600AsmPrinter.cpp - R600 Assebly printer ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// The R600AsmPrinter is used to print both assembly string and also binary +/// code. When passed an MCAsmStreamer it prints assembly and when passed +/// an MCObjectStreamer it outputs binary code. 
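
The EmitProgramInfoR600 routine that follows writes the shader's configuration into the .AMDGPU.config section as a flat stream of 32-bit (register, value) pairs: a resource register chosen from the calling convention together with the GPR count and CF stack size, the DB_SHADER_CONTROL kill bit, and, for compute entry points, the LDS allocation. A minimal stand-alone sketch of that encoding follows; byte order and the helper names are assumptions made purely for illustration, the real emission goes through MCStreamer::EmitIntValue as shown below:

#include <cstdint>
#include <utility>
#include <vector>

static void emitU32(std::vector<uint8_t> &Out, uint32_t V) {
  for (int I = 0; I < 4; ++I)          // little-endian assumed for the sketch
    Out.push_back(uint8_t(V >> (8 * I)));
}

// Each pair is (config register, value), e.g. the RsrcReg paired with
// S_NUM_GPRS(MaxGPR + 1) | S_STACK_SIZE(CFStackSize), or
// R_02880C_DB_SHADER_CONTROL paired with S_02880C_KILL_ENABLE(killPixel),
// using the macros visible in the code below.
std::vector<uint8_t>
encodeConfig(const std::vector<std::pair<uint32_t, uint32_t>> &Pairs) {
  std::vector<uint8_t> Out;
  for (const auto &P : Pairs) {
    emitU32(Out, P.first);
    emitU32(Out, P.second);
  }
  return Out;
}
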
+// +//===----------------------------------------------------------------------===// + +#include "R600AsmPrinter.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +using namespace llvm; + +AsmPrinter * +llvm::createR600AsmPrinterPass(TargetMachine &TM, + std::unique_ptr<MCStreamer> &&Streamer) { + return new R600AsmPrinter(TM, std::move(Streamer)); +} + +R600AsmPrinter::R600AsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)) { } + +StringRef R600AsmPrinter::getPassName() const { + return "R600 Assembly Printer"; +} + +void R600AsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { + unsigned MaxGPR = 0; + bool killPixel = false; + const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>(); + const R600RegisterInfo *RI = STM.getRegisterInfo(); + const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (MI.getOpcode() == R600::KILLGT) + killPixel = true; + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + const MachineOperand &MO = MI.getOperand(op_idx); + if (!MO.isReg()) + continue; + unsigned HWReg = RI->getHWRegIndex(MO.getReg()); + + // Register with value > 127 aren't GPR + if (HWReg > 127) + continue; + MaxGPR = std::max(MaxGPR, HWReg); + } + } + } + + unsigned RsrcReg; + if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { + // Evergreen / Northern Islands + switch (MF.getFunction().getCallingConv()) { + default: LLVM_FALLTHROUGH; + case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; + case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; + case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; + case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; + } + } else { + // R600 / R700 + switch (MF.getFunction().getCallingConv()) { + default: LLVM_FALLTHROUGH; + case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH; + case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH; + case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; + case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; + } + } + + OutStreamer->EmitIntValue(RsrcReg, 4); + OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | + S_STACK_SIZE(MFI->CFStackSize), 4); + OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); + OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); + + if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { + OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); + OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4); + } +} + +bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + + + // Functions needs to be cacheline (256B) aligned. 
+ MF.ensureAlignment(8); + + SetupMachineFunction(MF); + + MCContext &Context = getObjFileLowering().getContext(); + MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(ConfigSection); + + EmitProgramInfoR600(MF); + + EmitFunctionBody(); + + if (isVerbose()) { + MCSectionELF *CommentSection = + Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(CommentSection); + + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + OutStreamer->emitRawComment( + Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize))); + } + + return false; +} + diff --git a/lib/Target/AMDGPU/R600AsmPrinter.h b/lib/Target/AMDGPU/R600AsmPrinter.h new file mode 100644 index 000000000000..079fc707b03c --- /dev/null +++ b/lib/Target/AMDGPU/R600AsmPrinter.h @@ -0,0 +1,46 @@ +//===-- R600AsmPrinter.h - Print R600 assembly code -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// R600 Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H +#define LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H + +#include "llvm/CodeGen/AsmPrinter.h" + +namespace llvm { + +class R600AsmPrinter final : public AsmPrinter { + +public: + explicit R600AsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer); + StringRef getPassName() const override; + bool runOnMachineFunction(MachineFunction &MF) override; + /// Implemented in AMDGPUMCInstLower.cpp + void EmitInstruction(const MachineInstr *MI) override; + /// Lower the specified LLVM Constant to an MCExpr. + /// The AsmPrinter::lowerConstantof does not know how to lower + /// addrspacecast, therefore they should be lowered by this function. 
+ const MCExpr *lowerConstant(const Constant *CV) override; + +private: + void EmitProgramInfoR600(const MachineFunction &MF); +}; + +AsmPrinter * +createR600AsmPrinterPass(TargetMachine &TM, + std::unique_ptr<MCStreamer> &&Streamer); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 5e1ba6b506da..0c62d6a4b3d9 100644 --- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -19,6 +19,7 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -33,8 +34,8 @@ namespace { static bool isCFAlu(const MachineInstr &MI) { switch (MI.getOpcode()) { - case AMDGPU::CF_ALU: - case AMDGPU::CF_ALU_PUSH_BEFORE: + case R600::CF_ALU: + case R600::CF_ALU_PUSH_BEFORE: return true; default: return false; @@ -84,20 +85,20 @@ char &llvm::R600ClauseMergePassID = R600ClauseMergePass::ID; unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const { assert(isCFAlu(MI)); return MI - .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT)) + .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::COUNT)) .getImm(); } bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const { assert(isCFAlu(MI)); return MI - .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled)) + .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::Enabled)) .getImm(); } void R600ClauseMergePass::cleanPotentialDisabledCFAlu( MachineInstr &CFAlu) const { - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT); MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end(); I++; do { @@ -116,46 +117,46 @@ void R600ClauseMergePass::cleanPotentialDisabledCFAlu( bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu, const MachineInstr &LatrCFAlu) const { assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT); unsigned RootInstCount = getCFAluSize(RootCFAlu), LaterInstCount = getCFAluSize(LatrCFAlu); unsigned CumuledInsts = RootInstCount + LaterInstCount; if (CumuledInsts >= TII->getMaxAlusPerClause()) { - DEBUG(dbgs() << "Excess inst counts\n"); + LLVM_DEBUG(dbgs() << "Excess inst counts\n"); return false; } - if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + if (RootCFAlu.getOpcode() == R600::CF_ALU_PUSH_BEFORE) return false; // Is KCache Bank 0 compatible ? 
int Mode0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE0); int KBank0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK0); int KBank0LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR0); if (LatrCFAlu.getOperand(Mode0Idx).getImm() && RootCFAlu.getOperand(Mode0Idx).getImm() && (LatrCFAlu.getOperand(KBank0Idx).getImm() != RootCFAlu.getOperand(KBank0Idx).getImm() || LatrCFAlu.getOperand(KBank0LineIdx).getImm() != RootCFAlu.getOperand(KBank0LineIdx).getImm())) { - DEBUG(dbgs() << "Wrong KC0\n"); + LLVM_DEBUG(dbgs() << "Wrong KC0\n"); return false; } // Is KCache Bank 1 compatible ? int Mode1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE1); int KBank1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK1); int KBank1LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR1); if (LatrCFAlu.getOperand(Mode1Idx).getImm() && RootCFAlu.getOperand(Mode1Idx).getImm() && (LatrCFAlu.getOperand(KBank1Idx).getImm() != RootCFAlu.getOperand(KBank1Idx).getImm() || LatrCFAlu.getOperand(KBank1LineIdx).getImm() != RootCFAlu.getOperand(KBank1LineIdx).getImm())) { - DEBUG(dbgs() << "Wrong KC0\n"); + LLVM_DEBUG(dbgs() << "Wrong KC0\n"); return false; } if (LatrCFAlu.getOperand(Mode0Idx).getImm()) { diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 0e788df1c9c0..a19020276f35 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -19,6 +19,7 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -93,7 +94,7 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) { } bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { - if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && + if (Opcode == R600::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && getLoopDepth() > 1) return true; @@ -102,10 +103,10 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { switch(Opcode) { default: return false; - case AMDGPU::CF_ALU_PUSH_BEFORE: - case AMDGPU::CF_ALU_ELSE_AFTER: - case AMDGPU::CF_ALU_BREAK: - case AMDGPU::CF_ALU_CONTINUE: + case R600::CF_ALU_PUSH_BEFORE: + case R600::CF_ALU_ELSE_AFTER: + case R600::CF_ALU_BREAK: + case R600::CF_ALU_CONTINUE: if (CurrentSubEntries == 0) return false; if (ST->getWavefrontSize() == 64) { @@ -136,7 +137,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { return 0; case CFStack::FIRST_NON_WQM_PUSH: assert(!ST->hasCaymanISA()); - if (ST->getGeneration() <= R600Subtarget::R700) { + if (ST->getGeneration() <= AMDGPUSubtarget::R700) { // +1 For the push operation. // +2 Extra space required. 
return 3; @@ -149,7 +150,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { return 2; } case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: - assert(ST->getGeneration() >= R600Subtarget::EVERGREEN); + assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); // +1 For the push operation. // +1 Extra space required. return 2; @@ -167,8 +168,8 @@ void CFStack::updateMaxStackSize() { void CFStack::pushBranch(unsigned Opcode, bool isWQM) { CFStack::StackItem Item = CFStack::ENTRY; switch(Opcode) { - case AMDGPU::CF_PUSH_EG: - case AMDGPU::CF_ALU_PUSH_BEFORE: + case R600::CF_PUSH_EG: + case R600::CF_ALU_PUSH_BEFORE: if (!isWQM) { if (!ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) @@ -176,7 +177,7 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) { // See comment in // CFStack::getSubEntrySize() else if (CurrentEntries > 0 && - ST->getGeneration() > R600Subtarget::EVERGREEN && + ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && !ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; @@ -239,8 +240,8 @@ private: bool IsTrivialInst(MachineInstr &MI) const { switch (MI.getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: + case R600::KILL: + case R600::RETURN: return true; default: return false; @@ -249,44 +250,44 @@ private: const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { unsigned Opcode = 0; - bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN); + bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); switch (CFI) { case CF_TC: - Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; + Opcode = isEg ? R600::CF_TC_EG : R600::CF_TC_R600; break; case CF_VC: - Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; + Opcode = isEg ? R600::CF_VC_EG : R600::CF_VC_R600; break; case CF_CALL_FS: - Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; + Opcode = isEg ? R600::CF_CALL_FS_EG : R600::CF_CALL_FS_R600; break; case CF_WHILE_LOOP: - Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; + Opcode = isEg ? R600::WHILE_LOOP_EG : R600::WHILE_LOOP_R600; break; case CF_END_LOOP: - Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; + Opcode = isEg ? R600::END_LOOP_EG : R600::END_LOOP_R600; break; case CF_LOOP_BREAK: - Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; + Opcode = isEg ? R600::LOOP_BREAK_EG : R600::LOOP_BREAK_R600; break; case CF_LOOP_CONTINUE: - Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; + Opcode = isEg ? R600::CF_CONTINUE_EG : R600::CF_CONTINUE_R600; break; case CF_JUMP: - Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; + Opcode = isEg ? R600::CF_JUMP_EG : R600::CF_JUMP_R600; break; case CF_ELSE: - Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; + Opcode = isEg ? R600::CF_ELSE_EG : R600::CF_ELSE_R600; break; case CF_POP: - Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; + Opcode = isEg ? R600::POP_EG : R600::POP_R600; break; case CF_END: if (ST->hasCaymanISA()) { - Opcode = AMDGPU::CF_END_CM; + Opcode = R600::CF_END_CM; break; } - Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; + Opcode = isEg ? 
R600::CF_END_EG : R600::CF_END_R600; break; } assert (Opcode && "No opcode selected"); @@ -304,21 +305,21 @@ private: continue; if (MO.isDef()) { unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + if (R600::R600_Reg128RegClass.contains(Reg)) DstMI = Reg; else DstMI = TRI->getMatchingSuperReg(Reg, - TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); + AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &R600::R600_Reg128RegClass); } if (MO.isUse()) { unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + if (R600::R600_Reg128RegClass.contains(Reg)) SrcMI = Reg; else SrcMI = TRI->getMatchingSuperReg(Reg, - TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); + AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &R600::R600_Reg128RegClass); } } if ((DstRegs.find(SrcMI) == DstRegs.end())) { @@ -358,15 +359,15 @@ private: void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const { static const unsigned LiteralRegs[] = { - AMDGPU::ALU_LITERAL_X, - AMDGPU::ALU_LITERAL_Y, - AMDGPU::ALU_LITERAL_Z, - AMDGPU::ALU_LITERAL_W + R600::ALU_LITERAL_X, + R600::ALU_LITERAL_Y, + R600::ALU_LITERAL_Z, + R600::ALU_LITERAL_W }; const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs = TII->getSrcs(MI); for (const auto &Src:Srcs) { - if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X) + if (Src.first->getReg() != R600::ALU_LITERAL_X) continue; int64_t Imm = Src.second; std::vector<MachineOperand *>::iterator It = @@ -376,7 +377,7 @@ private: // Get corresponding Operand MachineOperand &Operand = MI.getOperand( - TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal)); + TII->getOperandIdx(MI.getOpcode(), R600::OpName::literal)); if (It != Lits.end()) { // Reuse existing literal reg @@ -399,7 +400,7 @@ private: unsigned LiteralPair0 = Literals[i]; unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) + TII->get(R600::LITERALS)) .addImm(LiteralPair0) .addImm(LiteralPair1); } @@ -441,7 +442,7 @@ private: } for (unsigned i = 0, e = Literals.size(); i < e; i += 2) { MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(), - TII->get(AMDGPU::LITERALS)); + TII->get(R600::LITERALS)); if (Literals[i]->isImm()) { MILit.addImm(Literals[i]->getImm()); } else { @@ -470,7 +471,7 @@ private: unsigned &CfCount) { CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount); + BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount); for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { BB->splice(InsertPos, BB, Clause.second[i]); } @@ -482,7 +483,7 @@ private: Clause.first->getOperand(0).setImm(0); CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount); + BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount); for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { BB->splice(InsertPos, BB, Clause.second[i]); } @@ -531,7 +532,7 @@ public: for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) { - DEBUG(dbgs() << CfCount << ":"; I->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; I->dump();); FetchClauses.push_back(MakeFetchClause(MBB, I)); CfCount++; LastAlu.back() 
= nullptr; @@ -539,33 +540,34 @@ public: } MachineBasicBlock::iterator MI = I; - if (MI->getOpcode() != AMDGPU::ENDIF) + if (MI->getOpcode() != R600::ENDIF) LastAlu.back() = nullptr; - if (MI->getOpcode() == AMDGPU::CF_ALU) + if (MI->getOpcode() == R600::CF_ALU) LastAlu.back() = &*MI; I++; bool RequiresWorkAround = CFStack.requiresWorkAroundForInst(MI->getOpcode()); switch (MI->getOpcode()) { - case AMDGPU::CF_ALU_PUSH_BEFORE: + case R600::CF_ALU_PUSH_BEFORE: if (RequiresWorkAround) { - DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) + LLVM_DEBUG(dbgs() + << "Applying bug work-around for ALU_PUSH_BEFORE\n"); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(R600::CF_PUSH_EG)) .addImm(CfCount + 1) .addImm(1); - MI->setDesc(TII->get(AMDGPU::CF_ALU)); + MI->setDesc(TII->get(R600::CF_ALU)); CfCount++; - CFStack.pushBranch(AMDGPU::CF_PUSH_EG); + CFStack.pushBranch(R600::CF_PUSH_EG); } else - CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); + CFStack.pushBranch(R600::CF_ALU_PUSH_BEFORE); LLVM_FALLTHROUGH; - case AMDGPU::CF_ALU: + case R600::CF_ALU: I = MI; AluClauses.push_back(MakeALUClause(MBB, I)); - DEBUG(dbgs() << CfCount << ":"; MI->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump();); CfCount++; break; - case AMDGPU::WHILELOOP: { + case R600::WHILELOOP: { CFStack.pushLoop(); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_WHILE_LOOP)) @@ -578,7 +580,7 @@ public: CfCount++; break; } - case AMDGPU::ENDLOOP: { + case R600::ENDLOOP: { CFStack.popLoop(); std::pair<unsigned, std::set<MachineInstr *>> Pair = std::move(LoopStack.back()); @@ -590,19 +592,19 @@ public: CfCount++; break; } - case AMDGPU::IF_PREDICATE_SET: { + case R600::IF_PREDICATE_SET: { LastAlu.push_back(nullptr); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP)) .addImm(0) .addImm(0); IfThenElseStack.push_back(MIb); - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump();); MI->eraseFromParent(); CfCount++; break; } - case AMDGPU::ELSE: { + case R600::ELSE: { MachineInstr * JumpInst = IfThenElseStack.back(); IfThenElseStack.pop_back(); CounterPropagateAddr(*JumpInst, CfCount); @@ -610,13 +612,13 @@ public: getHWInstrDesc(CF_ELSE)) .addImm(0) .addImm(0); - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump();); IfThenElseStack.push_back(MIb); MI->eraseFromParent(); CfCount++; break; } - case AMDGPU::ENDIF: { + case R600::ENDIF: { CFStack.popBranch(); if (LastAlu.back()) { ToPopAfter.push_back(LastAlu.back()); @@ -626,7 +628,7 @@ public: .addImm(CfCount + 1) .addImm(1); (void)MIb; - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump();); CfCount++; } @@ -638,7 +640,7 @@ public: MI->eraseFromParent(); break; } - case AMDGPU::BREAK: { + case R600::BREAK: { CfCount ++; MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_LOOP_BREAK)) @@ -647,7 +649,7 @@ public: MI->eraseFromParent(); break; } - case AMDGPU::CONTINUE: { + case R600::CONTINUE: { MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_LOOP_CONTINUE)) .addImm(0); @@ -656,12 +658,12 @@ public: CfCount++; break; } - case AMDGPU::RETURN: { + case R600::RETURN: { DebugLoc DL = MBB.findDebugLoc(MI); BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END)); CfCount++; if (CfCount % 2) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD)); + BuildMI(MBB, I, DL, 
TII->get(R600::PAD)); CfCount++; } MI->eraseFromParent(); @@ -673,7 +675,7 @@ public: } default: if (TII->isExport(MI->getOpcode())) { - DEBUG(dbgs() << CfCount << ":"; MI->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump();); CfCount++; } break; @@ -682,7 +684,7 @@ public: for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { MachineInstr *Alu = ToPopAfter[i]; BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), - TII->get(AMDGPU::CF_ALU_POP_AFTER)) + TII->get(R600::CF_ALU_POP_AFTER)) .addImm(Alu->getOperand(0).getImm()) .addImm(Alu->getOperand(1).getImm()) .addImm(Alu->getOperand(2).getImm()) diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h index 534461adc59f..0d33d82e8e0f 100644 --- a/lib/Target/AMDGPU/R600Defines.h +++ b/lib/Target/AMDGPU/R600Defines.h @@ -23,7 +23,7 @@ #define MO_FLAG_LAST (1 << 6) #define NUM_MO_FLAGS 7 -/// \brief Helper for getting the operand index for the instruction flags +/// Helper for getting the operand index for the instruction flags /// operand. #define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) @@ -52,7 +52,7 @@ namespace R600_InstFlag { #define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) -/// \brief Defines for extracting register information from register encoding +/// Defines for extracting register information from register encoding #define HW_REG_MASK 0x1ff #define HW_CHAN_SHIFT 9 diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index 0d8ccd088ec4..1683fe6c9a57 100644 --- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -19,6 +19,7 @@ #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600RegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -51,12 +52,12 @@ private: unsigned OccupiedDwords(MachineInstr &MI) const { switch (MI.getOpcode()) { - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::DOT_4: return 4; - case AMDGPU::KILL: + case R600::KILL: return 0; default: break; @@ -76,7 +77,7 @@ private: E = MI.operands_end(); It != E; ++It) { MachineOperand &MO = *It; - if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X) ++NumLiteral; } return 1 + NumLiteral; @@ -88,12 +89,12 @@ private: if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode())) return true; switch (MI.getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: + case R600::PRED_X: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::COPY: + case R600::DOT_4: return true; default: return false; @@ -102,9 +103,9 @@ private: bool IsTrivialInst(MachineInstr &MI) const { switch (MI.getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: - case AMDGPU::IMPLICIT_DEF: + case R600::KILL: + case R600::RETURN: + case R600::IMPLICIT_DEF: return true; default: return false; @@ -131,16 +132,16 @@ private: bool UpdateInstr = true) const { std::vector<std::pair<unsigned, unsigned>> UsedKCache; - if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4) + if 
(!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != R600::DOT_4) return true; const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Consts = TII->getSrcs(MI); assert( - (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) && + (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == R600::DOT_4) && "Can't assign Const"); for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + if (Consts[i].first->getReg() != R600::ALU_CONST) continue; unsigned Sel = Consts[i].second; unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; @@ -171,16 +172,16 @@ private: return true; for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + if (Consts[i].first->getReg() != R600::ALU_CONST) continue; switch(UsedKCache[j].first) { case 0: Consts[i].first->setReg( - AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); + R600::R600_KC0RegClass.getRegister(UsedKCache[j].second)); break; case 1: Consts[i].first->setReg( - AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); + R600::R600_KC1RegClass.getRegister(UsedKCache[j].second)); break; default: llvm_unreachable("Wrong Cache Line"); @@ -252,7 +253,7 @@ private: break; if (AluInstCount > TII->getMaxAlusPerClause()) break; - if (I->getOpcode() == AMDGPU::PRED_X) { + if (I->getOpcode() == R600::PRED_X) { // We put PRED_X in its own clause to ensure that ifcvt won't create // clauses with more than 128 insts. // IfCvt is indeed checking that "then" and "else" branches of an if @@ -288,7 +289,7 @@ private: AluInstCount += OccupiedDwords(*I); } unsigned Opcode = PushBeforeModifier ? - AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; + R600::CF_ALU_PUSH_BEFORE : R600::CF_ALU; BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) // We don't use the ADDR field until R600ControlFlowFinalizer pass, where // it is safe to assume it is 0. 
However if we always put 0 here, the ifcvt @@ -321,7 +322,7 @@ public: BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; MachineBasicBlock::iterator I = MBB.begin(); - if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU) + if (I != MBB.end() && I->getOpcode() == R600::CF_ALU) continue; // BB was already parsed for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { if (isALU(*I)) { diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index ffea231ee4d0..b924ff019dd1 100644 --- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -21,6 +21,7 @@ #include "R600RegisterInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -95,16 +96,16 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // Expand LDS_*_RET instructions if (TII->isLDSRetInstr(MI.getOpcode())) { - int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst); assert(DstIdx != -1); MachineOperand &DstOp = MI.getOperand(DstIdx); MachineInstr *Mov = TII->buildMovInstr(&MBB, I, - DstOp.getReg(), AMDGPU::OQAP); - DstOp.setReg(AMDGPU::OQAP); + DstOp.getReg(), R600::OQAP); + DstOp.setReg(R600::OQAP); int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), - AMDGPU::OpName::pred_sel); + R600::OpName::pred_sel); int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), - AMDGPU::OpName::pred_sel); + R600::OpName::pred_sel); // Copy the pred_sel bit Mov->getOperand(MovPredSelIdx).setReg( MI.getOperand(LDSPredSelIdx).getReg()); @@ -113,7 +114,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { switch (MI.getOpcode()) { default: break; // Expand PRED_X to one of the PRED_SET instructions. - case AMDGPU::PRED_X: { + case R600::PRED_X: { uint64_t Flags = MI.getOperand(3).getImm(); // The native opcode used by PRED_X is stored as an immediate in the // third operand. 
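
The KCache handling in R600EmitClauseMarkers above decodes each ALU_CONST selector into a channel and a line index (Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31) before rewriting the operand to an R600_KC0 or R600_KC1 register. A stand-alone restatement of that decoding, with invented names and comments derived only from the constants visible in the code above:

#include <cstdint>

struct KCacheSlotSketch {
  unsigned Chan;    // 0..3, the X/Y/Z/W component of the constant
  unsigned Index;   // line index within the cached constant window
};

KCacheSlotSketch decodeConstSel(unsigned Sel) {
  KCacheSlotSketch S;
  S.Chan = Sel & 3;                    // low two bits select the channel
  S.Index = ((Sel >> 2) - 512) & 31;   // constants start at 512, 32-entry window,
                                       //   as implied by the -512 and &31 above
  return S;
}
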
@@ -121,17 +122,18 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.getOperand(2).getImm(), // opcode MI.getOperand(0).getReg(), // dst MI.getOperand(1).getReg(), // src0 - AMDGPU::ZERO); // src1 + R600::ZERO); // src1 TII->addFlag(*PredSet, 0, MO_FLAG_MASK); if (Flags & MO_FLAG_PUSH) { - TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1); + TII->setImmOperand(*PredSet, R600::OpName::update_exec_mask, 1); } else { - TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1); + TII->setImmOperand(*PredSet, R600::OpName::update_pred, 1); } MI.eraseFromParent(); continue; } - case AMDGPU::DOT_4: { + case R600::DOT_4: { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); unsigned DstReg = MI.getOperand(0).getReg(); @@ -140,7 +142,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { for (unsigned Chan = 0; Chan < 4; ++Chan) { bool Mask = (Chan != TRI.getHWRegChan(DstReg)); unsigned SubDstReg = - AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); MachineInstr *BMI = TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); if (Chan > 0) { @@ -155,10 +157,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // While not strictly necessary from hw point of view, we force // all src operands of a dot4 inst to belong to the same slot. unsigned Src0 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) + TII->getOperandIdx(Opcode, R600::OpName::src0)) .getReg(); unsigned Src1 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) + TII->getOperandIdx(Opcode, R600::OpName::src1)) .getReg(); (void) Src0; (void) Src1; @@ -205,26 +207,26 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // T0_W = CUBE T1_Y, T1_Z for (unsigned Chan = 0; Chan < 4; Chan++) { unsigned DstReg = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); + TII->getOperandIdx(MI, R600::OpName::dst)).getReg(); unsigned Src0 = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); + TII->getOperandIdx(MI, R600::OpName::src0)).getReg(); unsigned Src1 = 0; // Determine the correct source registers if (!IsCube) { - int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); + int Src1Idx = TII->getOperandIdx(MI, R600::OpName::src1); if (Src1Idx != -1) { Src1 = MI.getOperand(Src1Idx).getReg(); } } if (IsReduction) { - unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan); Src0 = TRI.getSubReg(Src0, SubRegIndex); Src1 = TRI.getSubReg(Src1, SubRegIndex); } else if (IsCube) { static const int CubeSrcSwz[] = {2, 2, 0, 1}; - unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); - unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); + unsigned SubRegIndex0 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]); + unsigned SubRegIndex1 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]); Src1 = TRI.getSubReg(Src0, SubRegIndex1); Src0 = TRI.getSubReg(Src0, SubRegIndex0); } @@ -233,14 +235,14 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { bool Mask = false; bool NotLast = true; if (IsCube) { - unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan); DstReg = TRI.getSubReg(DstReg, SubRegIndex); } else { // Mask the write if the 
original instruction does not write to // the current Channel. Mask = (Chan != TRI.getHWRegChan(DstReg)); unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; - DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + DstReg = R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); } // Set the IsLast bit @@ -249,11 +251,11 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // Add the new instruction unsigned Opcode = MI.getOpcode(); switch (Opcode) { - case AMDGPU::CUBE_r600_pseudo: - Opcode = AMDGPU::CUBE_r600_real; + case R600::CUBE_r600_pseudo: + Opcode = R600::CUBE_r600_real; break; - case AMDGPU::CUBE_eg_pseudo: - Opcode = AMDGPU::CUBE_eg_real; + case R600::CUBE_eg_pseudo: + Opcode = R600::CUBE_eg_real; break; default: break; @@ -270,12 +272,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { if (NotLast) { TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST); } - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); + SetFlagInNewMI(NewMI, &MI, R600::OpName::clamp); + SetFlagInNewMI(NewMI, &MI, R600::OpName::literal); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_abs); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_abs); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_neg); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_neg); } MI.eraseFromParent(); } diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 66291d0be4e6..113d6249fa60 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -8,18 +8,18 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Custom DAG lowering for R600 +/// Custom DAG lowering for R600 // //===----------------------------------------------------------------------===// #include "R600ISelLowering.h" #include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600FrameLowering.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -35,13 +35,13 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include <cassert> #include <cstdint> #include <iterator> @@ -50,17 +50,19 @@ using namespace llvm; +#include "R600GenCallingConv.inc" + R600TargetLowering::R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI) - : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { - addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); - addRegisterClass(MVT::v4f32, 
&AMDGPU::R600_Reg128RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); + : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) { + addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass); + addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass); + addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass); + addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass); + addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass); + addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass); - computeRegisterProperties(STI.getRegisterInfo()); + computeRegisterProperties(Subtarget->getRegisterInfo()); // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, MVT::i32, Custom); @@ -147,6 +149,11 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f32, Expand); + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -216,6 +223,34 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f64, Expand); } + // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we + // need it for R600. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + + if (Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); + + // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we + // need it for R600. + if (Subtarget->hasBFE()) + setHasExtractBitsInsn(true); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; @@ -245,14 +280,10 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::LOAD); } -const R600Subtarget *R600TargetLowering::getSubtarget() const { - return static_cast<const R600Subtarget *>(Subtarget); -} - static inline bool isEOP(MachineBasicBlock::iterator I) { if (std::next(I) == I->getParent()->end()) return false; - return std::next(I)->getOpcode() == AMDGPU::RETURN; + return std::next(I)->getOpcode() == R600::RETURN; } MachineBasicBlock * @@ -261,24 +292,24 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock::iterator I = MI; - const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); + const R600InstrInfo *TII = Subtarget->getInstrInfo(); switch (MI.getOpcode()) { default: // Replace LDS_*_RET instruction that don't have any uses with the // equivalent LDS_*_NORET instruction. 
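The custom inserter below keeps the LDS _RET form only when its destination is actually read; LDS_CMPST_RET is left untouched because getLDSNoRetOp only covers the LDS_1A1D opcodes so far. A tiny standalone sketch of that decision, with illustrative names rather than the real opcodes:

  // Standalone sketch of the LDS _RET -> _NORET decision (illustrative names).
  #include <cassert>

  enum class LdsForm { Ret, NoRet };

  LdsForm chooseLdsForm(bool DstHasUses, bool IsCmpStRet) {
    // Keep the returning form if something reads the value, or if the opcode
    // is one the NORET mapping does not cover yet.
    return (DstHasUses || IsCmpStRet) ? LdsForm::Ret : LdsForm::NoRet;
  }

  int main() {
    assert(chooseLdsForm(false, false) == LdsForm::NoRet);
    assert(chooseLdsForm(true, false) == LdsForm::Ret);
    return 0;
  }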
if (TII->isLDSRetInstr(MI.getOpcode())) { - int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst); assert(DstIdx != -1); MachineInstrBuilder NewMI; // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add // LDS_1A2D support and remove this special case. if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) || - MI.getOpcode() == AMDGPU::LDS_CMPST_RET) + MI.getOpcode() == R600::LDS_CMPST_RET) return BB; NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode()))); + TII->get(R600::getLDSNoRetOp(MI.getOpcode()))); for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { NewMI.add(MI.getOperand(i)); } @@ -286,31 +317,24 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } break; - case AMDGPU::CLAMP_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), - MI.getOperand(1).getReg()); - TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP); - break; - } - case AMDGPU::FABS_R600: { + case R600::FABS_R600: { MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + *BB, I, R600::MOV, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); TII->addFlag(*NewMI, 0, MO_FLAG_ABS); break; } - case AMDGPU::FNEG_R600: { + case R600::FNEG_R600: { MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + *BB, I, R600::MOV, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); TII->addFlag(*NewMI, 0, MO_FLAG_NEG); break; } - case AMDGPU::MASK_WRITE: { + case R600::MASK_WRITE: { unsigned maskedRegister = MI.getOperand(0).getReg(); assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); @@ -318,7 +342,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, break; } - case AMDGPU::MOV_IMM_F32: + case R600::MOV_IMM_F32: TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1) .getFPImm() ->getValueAPF() @@ -326,39 +350,39 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .getZExtValue()); break; - case AMDGPU::MOV_IMM_I32: + case R600::MOV_IMM_I32: TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1).getImm()); break; - case AMDGPU::MOV_IMM_GLOBAL_ADDR: { + case R600::MOV_IMM_GLOBAL_ADDR: { //TODO: Perhaps combine this instruction with the next if possible auto MIB = TII->buildDefaultInstruction( - *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X); - int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal); + *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X); + int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal); //TODO: Ugh this is rather ugly MIB->getOperand(Idx) = MI.getOperand(1); break; } - case AMDGPU::CONST_COPY: { + case R600::CONST_COPY: { MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); - TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel, + *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST); + TII->setImmOperand(*NewMI, R600::OpName::src0_sel, MI.getOperand(1).getImm()); break; } - case AMDGPU::RAT_WRITE_CACHELESS_32_eg: - case AMDGPU::RAT_WRITE_CACHELESS_64_eg: - case AMDGPU::RAT_WRITE_CACHELESS_128_eg: + case R600::RAT_WRITE_CACHELESS_32_eg: + case R600::RAT_WRITE_CACHELESS_64_eg: + case 
R600::RAT_WRITE_CACHELESS_128_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .addImm(isEOP(I)); // Set End of program bit break; - case AMDGPU::RAT_STORE_TYPED_eg: + case R600::RAT_STORE_TYPED_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) @@ -366,49 +390,49 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addImm(isEOP(I)); // Set End of program bit break; - case AMDGPU::BRANCH: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + case R600::BRANCH: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP)) .add(MI.getOperand(0)); break; - case AMDGPU::BRANCH_COND_f32: { + case R600::BRANCH_COND_f32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), + R600::PREDICATE_BIT) .add(MI.getOperand(1)) - .addImm(AMDGPU::PRED_SETNE) + .addImm(R600::PRED_SETNE) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) .add(MI.getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addReg(R600::PREDICATE_BIT, RegState::Kill); break; } - case AMDGPU::BRANCH_COND_i32: { + case R600::BRANCH_COND_i32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), + R600::PREDICATE_BIT) .add(MI.getOperand(1)) - .addImm(AMDGPU::PRED_SETNE_INT) + .addImm(R600::PRED_SETNE_INT) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) .add(MI.getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addReg(R600::PREDICATE_BIT, RegState::Kill); break; } - case AMDGPU::EG_ExportSwz: - case AMDGPU::R600_ExportSwz: { + case R600::EG_ExportSwz: + case R600::R600_ExportSwz: { // Instruction is left unmodified if its not the last one of its type bool isLastInstructionOfItsType = true; unsigned InstExportType = MI.getOperand(1).getImm(); for (MachineBasicBlock::iterator NextExportInst = std::next(I), EndBlock = BB->end(); NextExportInst != EndBlock; NextExportInst = std::next(NextExportInst)) { - if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || - NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { + if (NextExportInst->getOpcode() == R600::EG_ExportSwz || + NextExportInst->getOpcode() == R600::R600_ExportSwz) { unsigned CurrentInstExportType = NextExportInst->getOperand(1) .getImm(); if (CurrentInstExportType == InstExportType) { @@ -420,7 +444,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; - unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40; + unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 
84 : 40; BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) @@ -433,7 +457,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addImm(EOP); break; } - case AMDGPU::RETURN: { + case R600::RETURN: { return BB; } } @@ -478,7 +502,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { - case AMDGPUIntrinsic::r600_store_swizzle: { + case Intrinsic::r600_store_swizzle: { SDLoc DL(Op); const SDValue Args[8] = { Chain, @@ -505,14 +529,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const EVT VT = Op.getValueType(); SDLoc DL(Op); switch (IntrinsicID) { - case AMDGPUIntrinsic::r600_tex: - case AMDGPUIntrinsic::r600_texc: { + case Intrinsic::r600_tex: + case Intrinsic::r600_texc: { unsigned TextureOp; switch (IntrinsicID) { - case AMDGPUIntrinsic::r600_tex: + case Intrinsic::r600_tex: TextureOp = 0; break; - case AMDGPUIntrinsic::r600_texc: + case Intrinsic::r600_texc: TextureOp = 1; break; default: @@ -542,7 +566,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const }; return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); } - case AMDGPUIntrinsic::r600_dot4: { + case Intrinsic::r600_dot4: { SDValue Args[8] = { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), DAG.getConstant(0, DL, MVT::i32)), @@ -566,7 +590,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_implicitarg_ptr: { MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS); - uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT); return DAG.getConstant(ByteOffset, DL, PtrVT); } case Intrinsic::r600_read_ngroups_x: @@ -589,23 +613,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return LowerImplicitParameter(DAG, VT, DL, 8); case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_X, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T1_X, VT); case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Y, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T1_Y, VT); case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Z, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T1_Z, VT); case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_X, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T0_X, VT); case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Y, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T0_Y, VT); case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Z, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T0_Z, VT); case Intrinsic::r600_recipsqrt_ieee: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); @@ -755,7 +779,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 
SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, DAG.getNode(ISD::FADD, DL, VT, FractPart, DAG.getConstantFP(-0.5, DL, MVT::f32))); - if (Gen >= R600Subtarget::R700) + if (Gen >= AMDGPUSubtarget::R700) return TrigVal; // On R600 hw, COS/SIN input must be between -Pi and Pi. return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, @@ -1527,7 +1551,7 @@ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); + const R600FrameLowering *TFL = Subtarget->getFrameLowering(); FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); @@ -1539,6 +1563,28 @@ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, Op.getValueType()); } +CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) const { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::Cold: + llvm_unreachable("kernels should not be handled here"); + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_LS: + return CC_R600; + default: + report_fatal_error("Unsupported calling convention."); + } +} + /// XXX Only kernel functions are supported, so we can assume for now that /// every function is a kernel function, but in the future we should use /// separate calling conventions for kernel and non-kernel functions. @@ -1550,8 +1596,6 @@ SDValue R600TargetLowering::LowerFormalArguments( CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - SmallVector<ISD::InputArg, 8> LocalIns; if (AMDGPU::isShader(CallConv)) { @@ -1571,7 +1615,7 @@ SDValue R600TargetLowering::LowerFormalArguments( } if (AMDGPU::isShader(CallConv)) { - unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Register); continue; @@ -1602,19 +1646,18 @@ SDValue R600TargetLowering::LowerFormalArguments( unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); unsigned PartOffset = VA.getLocMemOffset(); - unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset(); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); SDValue Arg = DAG.getLoad( ISD::UNINDEXED, Ext, VT, DL, Chain, - DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, + DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), + PtrInfo, MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); // 4 is the preferred alignment for the CONSTANT memory space. 
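The new R600TargetLowering::CCAssignFnForCall hook above routes every shader calling convention to the generated CC_R600 table (pulled in via R600GenCallingConv.inc), while kernel conventions never reach it because their arguments are loaded from constant memory in LowerFormalArguments instead. A minimal standalone sketch of that dispatch shape; the enum and the CC_R600_Table placeholder below are simplified stand-ins, not LLVM's types:

  // Simplified model of the calling-convention dispatch (placeholders only).
  #include <stdexcept>

  enum class Conv { Kernel, ShaderVS, ShaderPS, ShaderCS, Unknown };
  using AssignFn = const char *(*)();
  const char *CC_R600_Table() { return "CC_R600"; } // stands in for the generated table

  AssignFn ccAssignFnForCall(Conv C) {
    switch (C) {
    case Conv::Kernel:
      throw std::logic_error("kernels are lowered via explicit kernarg loads");
    case Conv::ShaderVS:
    case Conv::ShaderPS:
    case Conv::ShaderCS:
      return CC_R600_Table; // every shader convention shares one generated table
    default:
      throw std::runtime_error("unsupported calling convention");
    }
  }

  int main() { return ccAssignFnForCall(Conv::ShaderPS)() != nullptr ? 0 : 1; }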
InVals.push_back(Arg); - MFI->setABIArgOffset(Offset + MemVT.getStoreSize()); } return Chain; } @@ -1989,26 +2032,26 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) const { - const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); + const R600InstrInfo *TII = Subtarget->getInstrInfo(); if (!Src.isMachineOpcode()) return false; switch (Src.getMachineOpcode()) { - case AMDGPU::FNEG_R600: + case R600::FNEG_R600: if (!Neg.getNode()) return false; Src = Src.getOperand(0); Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); return true; - case AMDGPU::FABS_R600: + case R600::FABS_R600: if (!Abs.getNode()) return false; Src = Src.getOperand(0); Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); return true; - case AMDGPU::CONST_COPY: { + case R600::CONST_COPY: { unsigned Opcode = ParentNode->getMachineOpcode(); - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; if (!Sel.getNode()) return false; @@ -2019,17 +2062,17 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, // Gather constants values int SrcIndices[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + TII->getOperandIdx(Opcode, R600::OpName::src0), + TII->getOperandIdx(Opcode, R600::OpName::src1), + TII->getOperandIdx(Opcode, R600::OpName::src2), + TII->getOperandIdx(Opcode, R600::OpName::src0_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_W) }; std::vector<unsigned> Consts; for (int OtherSrcIdx : SrcIndices) { @@ -2042,7 +2085,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, } if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) { - if (Reg->getReg() == AMDGPU::ALU_CONST) { + if (Reg->getReg() == R600::ALU_CONST) { ConstantSDNode *Cst = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx)); Consts.push_back(Cst->getZExtValue()); @@ -2057,30 +2100,30 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, } Sel = CstOffset; - Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); + Src = DAG.getRegister(R600::ALU_CONST, MVT::f32); return true; } - case AMDGPU::MOV_IMM_GLOBAL_ADDR: + case R600::MOV_IMM_GLOBAL_ADDR: // Check if the Imm slot is used. Taken from below. 
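The MOV_IMM_I32 / MOV_IMM_F32 cases that follow fold a handful of special immediates onto R600's inline-constant registers and fall back to the single ALU_LITERAL_X slot for everything else. A standalone sketch of the float side of that mapping (illustrative, not the LLVM API):

  // Standalone sketch of the MOV_IMM_F32 folding decision (illustrative).
  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  const char *foldFloatImm(float V, uint32_t &LiteralBits) {
    if (V == 0.0f) return "ZERO";
    if (V == 0.5f) return "HALF";
    if (V == 1.0f) return "ONE";
    std::memcpy(&LiteralBits, &V, sizeof V); // raw bits go into the literal slot
    return "ALU_LITERAL_X";
  }

  int main() {
    uint32_t Bits = 0;
    std::printf("%s\n", foldFloatImm(0.5f, Bits));               // HALF, no literal used
    std::printf("%s 0x%08X\n", foldFloatImm(1.5f, Bits), Bits);  // ALU_LITERAL_X 0x3FC00000
    return 0;
  }

The integer path is analogous: 0 folds to ZERO and 1 to ONE_INT, anything else occupies the literal slot, which is why FoldOperand then refuses to fold a second distinct immediate into the same instruction.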
if (cast<ConstantSDNode>(Imm)->getZExtValue()) return false; Imm = Src.getOperand(0); - Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32); + Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32); return true; - case AMDGPU::MOV_IMM_I32: - case AMDGPU::MOV_IMM_F32: { - unsigned ImmReg = AMDGPU::ALU_LITERAL_X; + case R600::MOV_IMM_I32: + case R600::MOV_IMM_F32: { + unsigned ImmReg = R600::ALU_LITERAL_X; uint64_t ImmValue = 0; - if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { + if (Src.getMachineOpcode() == R600::MOV_IMM_F32) { ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0)); float FloatValue = FPC->getValueAPF().convertToFloat(); if (FloatValue == 0.0) { - ImmReg = AMDGPU::ZERO; + ImmReg = R600::ZERO; } else if (FloatValue == 0.5) { - ImmReg = AMDGPU::HALF; + ImmReg = R600::HALF; } else if (FloatValue == 1.0) { - ImmReg = AMDGPU::ONE; + ImmReg = R600::ONE; } else { ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); } @@ -2088,9 +2131,9 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0)); uint64_t Value = C->getZExtValue(); if (Value == 0) { - ImmReg = AMDGPU::ZERO; + ImmReg = R600::ZERO; } else if (Value == 1) { - ImmReg = AMDGPU::ONE_INT; + ImmReg = R600::ONE_INT; } else { ImmValue = Value; } @@ -2099,7 +2142,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, // Check that we aren't already using an immediate. // XXX: It's possible for an instruction to have more than one // immediate operand, but this is not supported yet. - if (ImmReg == AMDGPU::ALU_LITERAL_X) { + if (ImmReg == R600::ALU_LITERAL_X) { if (!Imm.getNode()) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm); @@ -2116,10 +2159,10 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, } } -/// \brief Fold the instructions after selecting them +/// Fold the instructions after selecting them SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); + const R600InstrInfo *TII = Subtarget->getInstrInfo(); if (!Node->isMachineOpcode()) return Node; @@ -2128,36 +2171,36 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, std::vector<SDValue> Ops(Node->op_begin(), Node->op_end()); - if (Opcode == AMDGPU::DOT_4) { + if (Opcode == R600::DOT_4) { int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + TII->getOperandIdx(Opcode, R600::OpName::src0_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_W) }; int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), - TII->getOperandIdx(Opcode, 
AMDGPU::OpName::src0_neg_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W) }; int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W) }; for (unsigned i = 0; i < 8; i++) { if (OperandIdx[i] < 0) @@ -2165,7 +2208,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SDValue &Src = Ops[OperandIdx[i] - 1]; SDValue &Neg = Ops[NegIdx[i] - 1]; SDValue &Abs = Ops[AbsIdx[i] - 1]; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); if (HasDst) SelIdx--; @@ -2173,42 +2216,28 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } - } else if (Opcode == AMDGPU::REG_SEQUENCE) { + } else if (Opcode == R600::REG_SEQUENCE) { for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { SDValue &Src = Ops[i]; if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } - } else if (Opcode == AMDGPU::CLAMP_R600) { - SDValue Src = Node->getOperand(0); - if (!Src.isMachineOpcode() || - !TII->hasInstrModifiers(Src.getMachineOpcode())) - return Node; - int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), - AMDGPU::OpName::clamp); - if (ClampIdx < 0) - return Node; - SDLoc DL(Node); - std::vector<SDValue> Ops(Src->op_begin(), Src->op_end()); - Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); - return DAG.getMachineNode(Src.getMachineOpcode(), DL, - Node->getVTList(), Ops); } else { if (!TII->hasInstrModifiers(Opcode)) return Node; int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) + TII->getOperandIdx(Opcode, R600::OpName::src0), + TII->getOperandIdx(Opcode, R600::OpName::src1), + TII->getOperandIdx(Opcode, 
R600::OpName::src2) }; int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) + TII->getOperandIdx(Opcode, R600::OpName::src0_neg), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg), + TII->getOperandIdx(Opcode, R600::OpName::src2_neg) }; int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs), -1 }; for (unsigned i = 0; i < 3; i++) { @@ -2218,9 +2247,9 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SDValue &Neg = Ops[NegIdx[i] - 1]; SDValue FakeAbs; SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); - int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); + int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal); if (HasDst) { SelIdx--; ImmIdx--; diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 2a774693f02b..907d1f10e151 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief R600 DAG Lowering interface definition +/// R600 DAG Lowering interface definition // //===----------------------------------------------------------------------===// @@ -23,6 +23,8 @@ class R600InstrInfo; class R600Subtarget; class R600TargetLowering final : public AMDGPUTargetLowering { + + const R600Subtarget *Subtarget; public: R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI); @@ -36,6 +38,7 @@ public: void ReplaceNodeResults(SDNode * N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td index 61106ed42e64..687a9affa138 100644 --- a/lib/Target/AMDGPU/R600InstrFormats.td +++ b/lib/Target/AMDGPU/R600InstrFormats.td @@ -11,10 +11,10 @@ // //===----------------------------------------------------------------------===// -def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">; +def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; def isR600toCayman : Predicate< - "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">; + "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; class R600Pat<dag pattern, dag result> : AMDGPUPat<pattern, result> { let SubtargetPredicate = isR600toCayman; @@ -41,7 +41,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern, bit LDS_1A2D = 0; let SubtargetPredicate = isR600toCayman; - let Namespace = "AMDGPU"; + let Namespace = "R600"; let OutOperandList = outs; let InOperandList = ins; let AsmString = asm; diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index 23e646c8147c..5397e779474c 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -8,7 +8,7 @@ 
//===----------------------------------------------------------------------===// // /// \file -/// \brief R600 Implementation of TargetInstrInfo. +/// R600 Implementation of TargetInstrInfo. // //===----------------------------------------------------------------------===// @@ -19,6 +19,7 @@ #include "R600Defines.h" #include "R600FrameLowering.h" #include "R600RegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" @@ -44,10 +45,15 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR -#include "AMDGPUGenDFAPacketizer.inc" +#include "R600GenDFAPacketizer.inc" + +#define GET_INSTRINFO_CTOR_DTOR +#define GET_INSTRMAP_INFO +#define GET_INSTRINFO_NAMED_OPS +#include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : AMDGPUInstrInfo(ST), RI(), ST(ST) {} + : R600GenInstrInfo(-1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; @@ -58,31 +64,31 @@ void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { unsigned VectorComponents = 0; - if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { + if ((R600::R600_Reg128RegClass.contains(DestReg) || + R600::R600_Reg128VerticalRegClass.contains(DestReg)) && + (R600::R600_Reg128RegClass.contains(SrcReg) || + R600::R600_Reg128VerticalRegClass.contains(SrcReg))) { VectorComponents = 4; - } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { + } else if((R600::R600_Reg64RegClass.contains(DestReg) || + R600::R600_Reg64VerticalRegClass.contains(DestReg)) && + (R600::R600_Reg64RegClass.contains(SrcReg) || + R600::R600_Reg64VerticalRegClass.contains(SrcReg))) { VectorComponents = 2; } if (VectorComponents > 0) { for (unsigned I = 0; I < VectorComponents; I++) { - unsigned SubRegIndex = RI.getSubRegFromChannel(I); - buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(I); + buildDefaultInstruction(MBB, MI, R600::MOV, RI.getSubReg(DestReg, SubRegIndex), RI.getSubReg(SrcReg, SubRegIndex)) .addReg(DestReg, RegState::Define | RegState::Implicit); } } else { - MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, R600::MOV, DestReg, SrcReg); - NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0)) + NewMI->getOperand(getOperandIdx(*NewMI, R600::OpName::src0)) .setIsKill(KillSrc); } } @@ -103,9 +109,9 @@ bool R600InstrInfo::isMov(unsigned Opcode) const { switch(Opcode) { default: return false; - case AMDGPU::MOV: - case AMDGPU::MOV_IMM_F32: - case AMDGPU::MOV_IMM_I32: + case R600::MOV: + case R600::MOV_IMM_F32: + case R600::MOV_IMM_I32: return true; } } @@ -117,10 +123,10 @@ bool R600InstrInfo::isReductionOp(unsigned Opcode) const { bool R600InstrInfo::isCubeOp(unsigned Opcode) const { switch(Opcode) { default: return false; - case AMDGPU::CUBE_r600_pseudo: - case AMDGPU::CUBE_r600_real: - case AMDGPU::CUBE_eg_pseudo: - case AMDGPU::CUBE_eg_real: + case R600::CUBE_r600_pseudo: + case 
R600::CUBE_r600_real: + case R600::CUBE_eg_pseudo: + case R600::CUBE_eg_real: return true; } } @@ -148,7 +154,7 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { } bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; + return isLDSInstr(Opcode) && getOperandIdx(Opcode, R600::OpName::dst) != -1; } bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const { @@ -157,12 +163,12 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const { if (isVector(MI) || isCubeOp(MI.getOpcode())) return true; switch (MI.getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: + case R600::PRED_X: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::COPY: + case R600::DOT_4: return true; default: return false; @@ -172,7 +178,7 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const { bool R600InstrInfo::isTransOnly(unsigned Opcode) const { if (ST.hasCaymanISA()) return false; - return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); + return (get(Opcode).getSchedClass() == R600::Sched::TransALU); } bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const { @@ -180,7 +186,7 @@ bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const { } bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { - return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); + return (get(Opcode).getSchedClass() == R600::Sched::VecALU); } bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const { @@ -214,8 +220,8 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const { bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { switch (Opcode) { - case AMDGPU::KILLGT: - case AMDGPU::GROUP_BARRIER: + case R600::KILLGT: + case R600::GROUP_BARRIER: return true; default: return false; @@ -223,11 +229,11 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { } bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const { - return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; + return MI.findRegisterUseOperandIdx(R600::AR_X) != -1; } bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const { - return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; + return MI.findRegisterDefOperandIdx(R600::AR_X) != -1; } bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { @@ -241,7 +247,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { TargetRegisterInfo::isVirtualRegister(I->getReg())) continue; - if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg())) + if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg())) return true; } return false; @@ -249,17 +255,17 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { static const unsigned SrcSelTable[][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - 
{AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} + {R600::OpName::src0, R600::OpName::src0_sel}, + {R600::OpName::src1, R600::OpName::src1_sel}, + {R600::OpName::src2, R600::OpName::src2_sel}, + {R600::OpName::src0_X, R600::OpName::src0_sel_X}, + {R600::OpName::src0_Y, R600::OpName::src0_sel_Y}, + {R600::OpName::src0_Z, R600::OpName::src0_sel_Z}, + {R600::OpName::src0_W, R600::OpName::src0_sel_W}, + {R600::OpName::src1_X, R600::OpName::src1_sel_X}, + {R600::OpName::src1_Y, R600::OpName::src1_sel_Y}, + {R600::OpName::src1_Z, R600::OpName::src1_sel_Z}, + {R600::OpName::src1_W, R600::OpName::src1_sel_W} }; for (const auto &Row : SrcSelTable) { @@ -274,23 +280,23 @@ SmallVector<std::pair<MachineOperand *, int64_t>, 3> R600InstrInfo::getSrcs(MachineInstr &MI) const { SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result; - if (MI.getOpcode() == AMDGPU::DOT_4) { + if (MI.getOpcode() == R600::DOT_4) { static const unsigned OpTable[8][2] = { - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}, + {R600::OpName::src0_X, R600::OpName::src0_sel_X}, + {R600::OpName::src0_Y, R600::OpName::src0_sel_Y}, + {R600::OpName::src0_Z, R600::OpName::src0_sel_Z}, + {R600::OpName::src0_W, R600::OpName::src0_sel_W}, + {R600::OpName::src1_X, R600::OpName::src1_sel_X}, + {R600::OpName::src1_Y, R600::OpName::src1_sel_Y}, + {R600::OpName::src1_Z, R600::OpName::src1_sel_Z}, + {R600::OpName::src1_W, R600::OpName::src1_sel_W}, }; for (unsigned j = 0; j < 8; j++) { MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::ALU_CONST) { + if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); @@ -302,9 +308,9 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { } static const unsigned OpTable[3][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + {R600::OpName::src0, R600::OpName::src0_sel}, + {R600::OpName::src1, R600::OpName::src1_sel}, + {R600::OpName::src2, R600::OpName::src2_sel}, }; for (unsigned j = 0; j < 3; j++) { @@ -313,15 +319,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { break; MachineOperand &MO = MI.getOperand(SrcIdx); unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::ALU_CONST) { + if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } - if (Reg == AMDGPU::ALU_LITERAL_X) { + if (Reg == R600::ALU_LITERAL_X) { MachineOperand &Operand = - MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal)); + MI.getOperand(getOperandIdx(MI.getOpcode(), R600::OpName::literal)); if (Operand.isImm()) { Result.push_back(std::make_pair(&MO, Operand.getImm())); continue; @@ -345,7 +351,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, ++i; unsigned Reg = Src.first->getReg(); int Index = RI.getEncodingValue(Reg) & 0xff; - if (Reg == AMDGPU::OQAP) { 
+ if (Reg == R600::OQAP) { Result.push_back(std::make_pair(Index, 0U)); } if (PV.find(Reg) != PV.end()) { @@ -435,7 +441,7 @@ unsigned R600InstrInfo::isLegalUpTo( const std::pair<int, unsigned> &Src = Srcs[j]; if (Src.first < 0 || Src.first == 255) continue; - if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { + if (Src.first == GET_REG_INDEX(RI.getEncodingValue(R600::OQAP))) { if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 && Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) { // The value from output queue A (denoted by register OQAP) can @@ -541,7 +547,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, for (unsigned i = 0, e = IG.size(); i < e; ++i) { IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount)); unsigned Op = getOperandIdx(IG[i]->getOpcode(), - AMDGPU::OpName::bank_swizzle); + R600::OpName::bank_swizzle); ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) IG[i]->getOperand(Op).getImm()); } @@ -610,14 +616,14 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) continue; for (const auto &Src : getSrcs(MI)) { - if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) + if (Src.first->getReg() == R600::ALU_LITERAL_X) Literals.insert(Src.second); if (Literals.size() > 4) return false; - if (Src.first->getReg() == AMDGPU::ALU_CONST) + if (Src.first->getReg() == R600::ALU_CONST) Consts.push_back(Src.second); - if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) || - AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) { + if (R600::R600_KC0RegClass.contains(Src.first->getReg()) || + R600::R600_KC1RegClass.contains(Src.first->getReg())) { unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff; unsigned Chan = RI.getHWRegChan(Src.first->getReg()); Consts.push_back((Index << 2) | Chan); @@ -636,7 +642,7 @@ R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { static bool isPredicateSetter(unsigned Opcode) { switch (Opcode) { - case AMDGPU::PRED_X: + case R600::PRED_X: return true; default: return false; @@ -658,12 +664,12 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB, static bool isJump(unsigned Opcode) { - return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; + return Opcode == R600::JUMP || Opcode == R600::JUMP_COND; } static bool isBranch(unsigned Opcode) { - return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 || - Opcode == AMDGPU::BRANCH_COND_f32; + return Opcode == R600::BRANCH || Opcode == R600::BRANCH_COND_i32 || + Opcode == R600::BRANCH_COND_f32; } bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, @@ -678,7 +684,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (I == MBB.end()) return false; - // AMDGPU::BRANCH* instructions are only available after isel and are not + // R600::BRANCH* instructions are only available after isel and are not // handled if (isBranch(I->getOpcode())) return true; @@ -687,7 +693,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, } // Remove successive JUMP - while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) { + while (I != MBB.begin() && std::prev(I)->getOpcode() == R600::JUMP) { MachineBasicBlock::iterator PriorI = std::prev(I); if (AllowModify) I->removeFromParent(); @@ -698,10 +704,10 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, // If there is only one terminator instruction, process it. 
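In the fitsConstReadLimitations hunk above, a candidate instruction group is rejected once it would need more than four distinct literal values (constant-buffer reads are collected separately and checked afterwards). A standalone model of just that literal check, with made-up values:

  // Standalone model of the "at most four distinct literals per group" check.
  #include <cassert>
  #include <cstdint>
  #include <set>
  #include <vector>

  bool fitsLiteralLimit(const std::vector<int64_t> &LiteralUses) {
    std::set<int64_t> Distinct(LiteralUses.begin(), LiteralUses.end());
    return Distinct.size() <= 4; // repeated uses of the same literal share a slot
  }

  int main() {
    assert(fitsLiteralLimit({42, 42, 7}));      // two distinct literals: fine
    assert(!fitsLiteralLimit({1, 2, 3, 4, 5})); // five distinct literals: rejected
    return 0;
  }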
unsigned LastOpc = LastInst.getOpcode(); if (I == MBB.begin() || !isJump((--I)->getOpcode())) { - if (LastOpc == AMDGPU::JUMP) { + if (LastOpc == R600::JUMP) { TBB = LastInst.getOperand(0).getMBB(); return false; - } else if (LastOpc == AMDGPU::JUMP_COND) { + } else if (LastOpc == R600::JUMP_COND) { auto predSet = I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; @@ -709,7 +715,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, TBB = LastInst.getOperand(0).getMBB(); Cond.push_back(predSet->getOperand(1)); Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false)); return false; } return true; // Can't handle indirect branch. @@ -720,7 +726,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, unsigned SecondLastOpc = SecondLastInst.getOpcode(); // If the block ends with a B and a Bcc, handle it. - if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { + if (SecondLastOpc == R600::JUMP_COND && LastOpc == R600::JUMP) { auto predSet = --I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; @@ -729,7 +735,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, FBB = LastInst.getOperand(0).getMBB(); Cond.push_back(predSet->getOperand(1)); Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false)); return false; } @@ -741,8 +747,8 @@ static MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); It != E; ++It) { - if (It->getOpcode() == AMDGPU::CF_ALU || - It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + if (It->getOpcode() == R600::CF_ALU || + It->getOpcode() == R600::CF_ALU_PUSH_BEFORE) return It.getReverse(); } return MBB.end(); @@ -759,7 +765,7 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB, if (!FBB) { if (Cond.empty()) { - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); + BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(TBB); return 1; } else { MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); @@ -767,14 +773,14 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB, addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + BuildMI(&MBB, DL, get(R600::JUMP_COND)) .addMBB(TBB) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addReg(R600::PREDICATE_BIT, RegState::Kill); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) return 1; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); - CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + assert (CfAlu->getOpcode() == R600::CF_ALU); + CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE)); return 1; } } else { @@ -782,15 +788,15 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB, assert(PredSet && "No previous predicate !"); addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + BuildMI(&MBB, DL, get(R600::JUMP_COND)) .addMBB(TBB) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); + .addReg(R600::PREDICATE_BIT, RegState::Kill); + BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(FBB); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) 
return 2; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); - CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + assert (CfAlu->getOpcode() == R600::CF_ALU); + CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE)); return 2; } } @@ -811,18 +817,18 @@ unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB, switch (I->getOpcode()) { default: return 0; - case AMDGPU::JUMP_COND: { + case R600::JUMP_COND: { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); clearFlag(*predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) break; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); - CfAlu->setDesc(get(AMDGPU::CF_ALU)); + assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(R600::CF_ALU)); break; } - case AMDGPU::JUMP: + case R600::JUMP: I->eraseFromParent(); break; } @@ -836,18 +842,18 @@ unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB, // FIXME: only one case?? default: return 1; - case AMDGPU::JUMP_COND: { + case R600::JUMP_COND: { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); clearFlag(*predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) break; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); - CfAlu->setDesc(get(AMDGPU::CF_ALU)); + assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(R600::CF_ALU)); break; } - case AMDGPU::JUMP: + case R600::JUMP: I->eraseFromParent(); break; } @@ -862,9 +868,9 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const { unsigned Reg = MI.getOperand(idx).getReg(); switch (Reg) { default: return false; - case AMDGPU::PRED_SEL_ONE: - case AMDGPU::PRED_SEL_ZERO: - case AMDGPU::PREDICATE_BIT: + case R600::PRED_SEL_ONE: + case R600::PRED_SEL_ZERO: + case R600::PREDICATE_BIT: return true; } } @@ -875,9 +881,9 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const { // be predicated. Until we have proper support for instruction clauses in the // backend, we will mark KILL* instructions as unpredicable. - if (MI.getOpcode() == AMDGPU::KILLGT) { + if (MI.getOpcode() == R600::KILLGT) { return false; - } else if (MI.getOpcode() == AMDGPU::CF_ALU) { + } else if (MI.getOpcode() == R600::CF_ALU) { // If the clause start in the middle of MBB then the MBB has more // than a single clause, unable to predicate several clauses. 
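The reverseBranchCondition hunk further down in the R600InstrInfo.cpp diff is a pure table flip: each PRED_SET* compare swaps with its negated twin and the PRED_SEL_ZERO/PRED_SEL_ONE selector is inverted, so reversing twice round-trips, and anything outside the table reports failure. A standalone sketch of that involution, with a reduced enum standing in for the real opcodes:

  // Standalone sketch of the condition-reversal table (reduced enum, not real opcodes).
  #include <cassert>

  enum class Pred { SetE, SetNE, SetE_Int, SetNE_Int, Other };

  bool reverseCondition(Pred &P) {
    switch (P) {
    case Pred::SetE:      P = Pred::SetNE;     return false; // false == success, as in LLVM
    case Pred::SetNE:     P = Pred::SetE;      return false;
    case Pred::SetE_Int:  P = Pred::SetNE_Int; return false;
    case Pred::SetNE_Int: P = Pred::SetE_Int;  return false;
    default:              return true;  // unknown predicate: cannot reverse
    }
  }

  int main() {
    Pred P = Pred::SetE;
    assert(!reverseCondition(P) && P == Pred::SetNE);
    assert(!reverseCondition(P) && P == Pred::SetE);  // reversing twice round-trips
    Pred Q = Pred::Other;
    assert(reverseCondition(Q));
    return 0;
  }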
if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI)) @@ -887,7 +893,7 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const { } else if (isVector(MI)) { return false; } else { - return AMDGPUInstrInfo::isPredicable(MI); + return TargetInstrInfo::isPredicable(MI); } } @@ -928,17 +934,17 @@ bool R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { MachineOperand &MO = Cond[1]; switch (MO.getImm()) { - case AMDGPU::PRED_SETE_INT: - MO.setImm(AMDGPU::PRED_SETNE_INT); + case R600::PRED_SETE_INT: + MO.setImm(R600::PRED_SETNE_INT); break; - case AMDGPU::PRED_SETNE_INT: - MO.setImm(AMDGPU::PRED_SETE_INT); + case R600::PRED_SETNE_INT: + MO.setImm(R600::PRED_SETE_INT); break; - case AMDGPU::PRED_SETE: - MO.setImm(AMDGPU::PRED_SETNE); + case R600::PRED_SETE: + MO.setImm(R600::PRED_SETNE); break; - case AMDGPU::PRED_SETNE: - MO.setImm(AMDGPU::PRED_SETE); + case R600::PRED_SETNE: + MO.setImm(R600::PRED_SETE); break; default: return true; @@ -946,11 +952,11 @@ R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) con MachineOperand &MO2 = Cond[2]; switch (MO2.getReg()) { - case AMDGPU::PRED_SEL_ZERO: - MO2.setReg(AMDGPU::PRED_SEL_ONE); + case R600::PRED_SEL_ZERO: + MO2.setReg(R600::PRED_SEL_ONE); break; - case AMDGPU::PRED_SEL_ONE: - MO2.setReg(AMDGPU::PRED_SEL_ZERO); + case R600::PRED_SEL_ONE: + MO2.setReg(R600::PRED_SEL_ZERO); break; default: return true; @@ -967,22 +973,22 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, ArrayRef<MachineOperand> Pred) const { int PIdx = MI.findFirstPredOperandIdx(); - if (MI.getOpcode() == AMDGPU::CF_ALU) { + if (MI.getOpcode() == R600::CF_ALU) { MI.getOperand(8).setImm(0); return true; } - if (MI.getOpcode() == AMDGPU::DOT_4) { - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X)) + if (MI.getOpcode() == R600::DOT_4) { + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_X)) .setReg(Pred[2].getReg()); - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y)) + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Y)) .setReg(Pred[2].getReg()); - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z)) + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Z)) .setReg(Pred[2].getReg()); - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W)) + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W)) .setReg(Pred[2].getReg()); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); - MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } @@ -990,7 +996,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, MachineOperand &PMO = MI.getOperand(PIdx); PMO.setReg(Pred[2].getReg()); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); - MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } @@ -1020,20 +1026,20 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { default: { MachineBasicBlock *MBB = MI.getParent(); int OffsetOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::addr); // addr is a custom operand with multiple MI operands, and only the // first MI operand is given a name. 
int RegOpIdx = OffsetOpIdx + 1; int ChanOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::chan); if (isRegisterLoad(MI)) { int DstOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::dst); unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + if (OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(), getIndirectAddrRegClass()->getRegister(Address)); } else { @@ -1042,12 +1048,12 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } } else if (isRegisterStore(MI)) { int ValOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::val); unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + if (OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), MI.getOperand(ValOpIdx).getReg()); } else { @@ -1062,15 +1068,15 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MBB->erase(MI); return true; } - case AMDGPU::R600_EXTRACT_ELT_V2: - case AMDGPU::R600_EXTRACT_ELT_V4: + case R600::R600_EXTRACT_ELT_V2: + case R600::R600_EXTRACT_ELT_V4: buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(), RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address MI.getOperand(2).getReg(), RI.getHWRegChan(MI.getOperand(1).getReg())); break; - case AMDGPU::R600_INSERT_ELT_V2: - case AMDGPU::R600_INSERT_ELT_V4: + case R600::R600_INSERT_ELT_V2: + case R600::R600_INSERT_ELT_V4: buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address MI.getOperand(3).getReg(), // Offset @@ -1082,7 +1088,8 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const { + const MachineFunction &MF, + const R600RegisterInfo &TRI) const { const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); const R600FrameLowering *TFL = ST.getFrameLowering(); @@ -1093,17 +1100,15 @@ void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, return; for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { - unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index); - Reserved.set(SuperReg); for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); - Reserved.set(Reg); + unsigned Reg = R600::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + TRI.reserveRegisterTuples(Reserved, Reg); } } } const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::R600_TReg32_XRegClass; + return &R600::R600_TReg32_XRegClass; } MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, @@ -1121,20 +1126,20 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, unsigned AddrReg; switch 
(AddrChan) { default: llvm_unreachable("Invalid Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break; } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, OffsetReg); - setImmOperand(*MOVA, AMDGPU::OpName::write, 0); + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg, + R600::AR_X, OffsetReg); + setImmOperand(*MOVA, R600::OpName::write, 0); - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV, AddrReg, ValueReg) - .addReg(AMDGPU::AR_X, + .addReg(R600::AR_X, RegState::Implicit | RegState::Kill); - setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1); + setImmOperand(*Mov, R600::OpName::dst_rel, 1); return Mov; } @@ -1153,21 +1158,21 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, unsigned AddrReg; switch (AddrChan) { default: llvm_unreachable("Invalid Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break; } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg, + R600::AR_X, OffsetReg); - setImmOperand(*MOVA, AMDGPU::OpName::write, 0); - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + setImmOperand(*MOVA, R600::OpName::write, 0); + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV, ValueReg, AddrReg) - .addReg(AMDGPU::AR_X, + .addReg(R600::AR_X, RegState::Implicit | RegState::Kill); - setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1); + setImmOperand(*Mov, R600::OpName::src0_rel, 1); return Mov; } @@ -1265,7 +1270,7 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB //XXX: The r600g finalizer expects this to be 1, once we've moved the //scheduling to the backend, we can change the default to 0. 
MIB.addImm(1) // $last - .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel + .addReg(R600::PRED_SEL_OFF) // $pred_sel .addImm(0) // $literal .addImm(0); // $bank_swizzle @@ -1286,23 +1291,23 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB static unsigned getSlotedOps(unsigned Op, unsigned Slot) { switch (Op) { - OPERAND_CASE(AMDGPU::OpName::update_exec_mask) - OPERAND_CASE(AMDGPU::OpName::update_pred) - OPERAND_CASE(AMDGPU::OpName::write) - OPERAND_CASE(AMDGPU::OpName::omod) - OPERAND_CASE(AMDGPU::OpName::dst_rel) - OPERAND_CASE(AMDGPU::OpName::clamp) - OPERAND_CASE(AMDGPU::OpName::src0) - OPERAND_CASE(AMDGPU::OpName::src0_neg) - OPERAND_CASE(AMDGPU::OpName::src0_rel) - OPERAND_CASE(AMDGPU::OpName::src0_abs) - OPERAND_CASE(AMDGPU::OpName::src0_sel) - OPERAND_CASE(AMDGPU::OpName::src1) - OPERAND_CASE(AMDGPU::OpName::src1_neg) - OPERAND_CASE(AMDGPU::OpName::src1_rel) - OPERAND_CASE(AMDGPU::OpName::src1_abs) - OPERAND_CASE(AMDGPU::OpName::src1_sel) - OPERAND_CASE(AMDGPU::OpName::pred_sel) + OPERAND_CASE(R600::OpName::update_exec_mask) + OPERAND_CASE(R600::OpName::update_pred) + OPERAND_CASE(R600::OpName::write) + OPERAND_CASE(R600::OpName::omod) + OPERAND_CASE(R600::OpName::dst_rel) + OPERAND_CASE(R600::OpName::clamp) + OPERAND_CASE(R600::OpName::src0) + OPERAND_CASE(R600::OpName::src0_neg) + OPERAND_CASE(R600::OpName::src0_rel) + OPERAND_CASE(R600::OpName::src0_abs) + OPERAND_CASE(R600::OpName::src0_sel) + OPERAND_CASE(R600::OpName::src1) + OPERAND_CASE(R600::OpName::src1_neg) + OPERAND_CASE(R600::OpName::src1_rel) + OPERAND_CASE(R600::OpName::src1_abs) + OPERAND_CASE(R600::OpName::src1_sel) + OPERAND_CASE(R600::OpName::pred_sel) default: llvm_unreachable("Wrong Operand"); } @@ -1313,39 +1318,39 @@ static unsigned getSlotedOps(unsigned Op, unsigned Slot) { MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) const { - assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); + assert (MI->getOpcode() == R600::DOT_4 && "Not Implemented"); unsigned Opcode; - if (ST.getGeneration() <= R600Subtarget::R700) - Opcode = AMDGPU::DOT4_r600; + if (ST.getGeneration() <= AMDGPUSubtarget::R700) + Opcode = R600::DOT4_r600; else - Opcode = AMDGPU::DOT4_eg; + Opcode = R600::DOT4_eg; MachineBasicBlock::iterator I = MI; MachineOperand &Src0 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); + getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src0, Slot))); MachineOperand &Src1 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); + getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src1, Slot))); MachineInstr *MIB = buildDefaultInstruction( MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); static const unsigned Operands[14] = { - AMDGPU::OpName::update_exec_mask, - AMDGPU::OpName::update_pred, - AMDGPU::OpName::write, - AMDGPU::OpName::omod, - AMDGPU::OpName::dst_rel, - AMDGPU::OpName::clamp, - AMDGPU::OpName::src0_neg, - AMDGPU::OpName::src0_rel, - AMDGPU::OpName::src0_abs, - AMDGPU::OpName::src0_sel, - AMDGPU::OpName::src1_neg, - AMDGPU::OpName::src1_rel, - AMDGPU::OpName::src1_abs, - AMDGPU::OpName::src1_sel, + R600::OpName::update_exec_mask, + R600::OpName::update_pred, + R600::OpName::write, + R600::OpName::omod, + R600::OpName::dst_rel, + R600::OpName::clamp, + R600::OpName::src0_neg, + R600::OpName::src0_rel, + R600::OpName::src0_abs, + R600::OpName::src0_sel, + R600::OpName::src1_neg, 
+ R600::OpName::src1_rel, + R600::OpName::src1_abs, + R600::OpName::src1_sel, }; MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - getSlotedOps(AMDGPU::OpName::pred_sel, Slot))); - MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) + getSlotedOps(R600::OpName::pred_sel, Slot))); + MIB->getOperand(getOperandIdx(Opcode, R600::OpName::pred_sel)) .setReg(MO.getReg()); for (unsigned i = 0; i < 14; i++) { @@ -1362,16 +1367,16 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, MachineBasicBlock::iterator I, unsigned DstReg, uint64_t Imm) const { - MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, - AMDGPU::ALU_LITERAL_X); - setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm); + MachineInstr *MovImm = buildDefaultInstruction(BB, I, R600::MOV, DstReg, + R600::ALU_LITERAL_X); + setImmOperand(*MovImm, R600::OpName::literal, Imm); return MovImm; } MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned DstReg, unsigned SrcReg) const { - return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg); + return buildDefaultInstruction(*MBB, I, R600::MOV, DstReg, SrcReg); } int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { @@ -1379,7 +1384,7 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { } int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const { - return AMDGPU::getNamedOperandIdx(Opcode, Op); + return R600::getNamedOperandIdx(Opcode, Op); } void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op, @@ -1406,25 +1411,25 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx, bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; switch (Flag) { case MO_FLAG_CLAMP: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp); + FlagIndex = getOperandIdx(MI, R600::OpName::clamp); break; case MO_FLAG_MASK: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write); + FlagIndex = getOperandIdx(MI, R600::OpName::write); break; case MO_FLAG_NOT_LAST: case MO_FLAG_LAST: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last); + FlagIndex = getOperandIdx(MI, R600::OpName::last); break; case MO_FLAG_NEG: switch (SrcIdx) { case 0: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg); + FlagIndex = getOperandIdx(MI, R600::OpName::src0_neg); break; case 1: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg); + FlagIndex = getOperandIdx(MI, R600::OpName::src1_neg); break; case 2: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg); + FlagIndex = getOperandIdx(MI, R600::OpName::src2_neg); break; } break; @@ -1435,10 +1440,10 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx, (void)IsOP3; switch (SrcIdx) { case 0: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs); + FlagIndex = getOperandIdx(MI, R600::OpName::src0_abs); break; case 1: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs); + FlagIndex = getOperandIdx(MI, R600::OpName::src1_abs); break; } break; @@ -1499,15 +1504,15 @@ unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind( switch (Kind) { case PseudoSourceValue::Stack: case PseudoSourceValue::FixedStack: - return AMDGPUASI.PRIVATE_ADDRESS; + return ST.getAMDGPUAS().PRIVATE_ADDRESS; case PseudoSourceValue::ConstantPool: case PseudoSourceValue::GOT: case PseudoSourceValue::JumpTable: case PseudoSourceValue::GlobalValueCallEntry: case PseudoSourceValue::ExternalSymbolCallEntry: case 
PseudoSourceValue::TargetCustom: - return AMDGPUASI.CONSTANT_ADDRESS; + return ST.getAMDGPUAS().CONSTANT_ADDRESS; } llvm_unreachable("Invalid pseudo source kind"); - return AMDGPUASI.PRIVATE_ADDRESS; + return ST.getAMDGPUAS().PRIVATE_ADDRESS; } diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h index abaa37450758..7a3dece31665 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.h +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -8,15 +8,18 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface definition for R600InstrInfo +/// Interface definition for R600InstrInfo // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H #define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H -#include "AMDGPUInstrInfo.h" #include "R600RegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "R600GenInstrInfo.inc" namespace llvm { @@ -34,7 +37,7 @@ class MachineInstr; class MachineInstrBuilder; class R600Subtarget; -class R600InstrInfo final : public AMDGPUInstrInfo { +class R600InstrInfo final : public R600GenInstrInfo { private: const R600RegisterInfo RI; const R600Subtarget &ST; @@ -150,7 +153,7 @@ public: /// Same but using const index set instead of MI set. bool fitsConstReadLimitations(const std::vector<unsigned>&) const; - /// \brief Vector instructions are instructions that must fill all + /// Vector instructions are instructions that must fill all /// instruction slots within an instruction group. bool isVector(const MachineInstr &MI) const; @@ -209,9 +212,10 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; - /// \brief Reserve the registers that may be accesed using indirect addressing. + /// Reserve the registers that may be accesed using indirect addressing. void reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const; + const MachineFunction &MF, + const R600RegisterInfo &TRI) const; /// Calculate the "Indirect Address" for the given \p RegIndex and /// \p Channel @@ -235,7 +239,7 @@ public: /// read or write or -1 if indirect addressing is not used by this program. int getIndirectIndexEnd(const MachineFunction &MF) const; - /// \brief Build instruction(s) for an indirect register write. + /// Build instruction(s) for an indirect register write. /// /// \returns The instruction that performs the indirect register write MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, @@ -243,7 +247,7 @@ public: unsigned ValueReg, unsigned Address, unsigned OffsetReg) const; - /// \brief Build instruction(s) for an indirect register read. + /// Build instruction(s) for an indirect register read. /// /// \returns The instruction that performs the indirect register read MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, @@ -281,23 +285,23 @@ public: MachineBasicBlock::iterator I, unsigned DstReg, unsigned SrcReg) const; - /// \brief Get the index of Op in the MachineInstr. + /// Get the index of Op in the MachineInstr. /// /// \returns -1 if the Instruction does not contain the specified \p Op. int getOperandIdx(const MachineInstr &MI, unsigned Op) const; - /// \brief Get the index of \p Op for the given Opcode. + /// Get the index of \p Op for the given Opcode. /// /// \returns -1 if the Instruction does not contain the specified \p Op. int getOperandIdx(unsigned Opcode, unsigned Op) const; - /// \brief Helper function for setting instruction flag values. 
+ /// Helper function for setting instruction flag values. void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const; - ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. + ///Add one of the MO_FLAG* flags to the specified \p Operand. void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; - ///\brief Determine if the specified \p Flag is set on this \p Operand. + ///Determine if the specified \p Flag is set on this \p Operand. bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) @@ -307,7 +311,7 @@ public: MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0, unsigned Flag = 0) const; - /// \brief Clear the specified flag on the instruction. + /// Clear the specified flag on the instruction. void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; // Helper functions that check the opcode for status information @@ -323,7 +327,7 @@ public: PseudoSourceValue::PSVKind Kind) const override; }; -namespace AMDGPU { +namespace R600 { int getLDSNoRetOp(uint16_t Opcode); diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 801e4e61fca6..7bf174f4cd86 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -12,20 +12,19 @@ // //===----------------------------------------------------------------------===// -include "R600Intrinsics.td" include "R600InstrFormats.td" // FIXME: Should not be arbitrarily split from other R600 inst classes. class R600WrapperInst <dag outs, dag ins, string asm = "", list<dag> pattern = []> : AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { let SubtargetPredicate = isR600toCayman; + let Namespace = "R600"; } class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> : InstR600 <outs, ins, asm, pattern, NullALU> { - let Namespace = "AMDGPU"; } def MEMxi : Operand<iPTR> { @@ -81,11 +80,18 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>; def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>; def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>; def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>; +def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; def R600_Pred : PredicateOperand<i32, (ops R600_Predicate), (ops PRED_SEL_OFF)>; +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1, Namespace = "R600" in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(AMDGPUendpgm)] + >; +} let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { @@ -219,34 +225,6 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern, } // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 -def TEX_SHADOW : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return (TType >= 6 && TType <= 8) || TType == 13; - }] ->; - -def TEX_RECT : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 5; - }] ->; - -def TEX_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 9 || TType == 10 || TType == 16; - }] ->; - -def TEX_SHADOW_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 11 || TType == 12 || TType == 17; - }] ->; - class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, dag 
outs, dag ins, string asm, list<dag> pattern> : InstR600ISA <outs, ins, asm, pattern>, @@ -357,6 +335,8 @@ def vtx_id2_load : LoadVtxId2 <load>; // R600 SDNodes //===----------------------------------------------------------------------===// +let Namespace = "R600" in { + def INTERP_PAIR_XY : AMDGPUShaderInst < (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), @@ -369,6 +349,8 @@ def INTERP_PAIR_ZW : AMDGPUShaderInst < "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", []>; +} + def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPVariadic] @@ -416,11 +398,15 @@ def : R600Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, // Interpolation Instructions //===----------------------------------------------------------------------===// +let Namespace = "R600" in { + def INTERP_VEC_LOAD : AMDGPUShaderInst < (outs R600_Reg128:$dst), (ins i32imm:$src0), "INTERP_LOAD $src0 : $dst">; +} + def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { let bank_swizzle = 5; } @@ -660,14 +646,7 @@ def PAD : R600WrapperInst <(outs), (ins), "PAD", [] > { let isCodeGenOnly = 1, isPseudo = 1 in { -let usesCustomInserter = 1 in { - -class CLAMP <RegisterClass rc> : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "CLAMP $dst, $src0", - [(set f32:$dst, (AMDGPUclamp f32:$src0))] ->; +let Namespace = "R600", usesCustomInserter = 1 in { class FABS <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), @@ -799,7 +778,9 @@ class MOV_IMM <ValueType vt, Operand immType> : R600WrapperInst < (ins immType:$imm), "", [] ->; +> { + let Namespace = "R600"; +} } // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 @@ -1014,7 +995,7 @@ class CNDGE_Common <bits<5> inst> : R600_3OP < } -let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in { class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins // Slot X UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, @@ -1193,7 +1174,6 @@ class COS_Common <bits<11> inst> : R600_1OP < let Itinerary = TransALU; } -def CLAMP_R600 : CLAMP <R600_Reg32>; def FABS_R600 : FABS<R600_Reg32>; def FNEG_R600 : FNEG<R600_Reg32>; @@ -1334,7 +1314,9 @@ let Predicates = [isR600] in { // Regist loads and stores - for indirect addressing //===----------------------------------------------------------------------===// +let Namespace = "R600" in { defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>; +} // Hardcode channel to 0 // NOTE: LSHR is not available here. 
LSHR is per family instruction @@ -1386,11 +1368,12 @@ let usesCustomInserter = 1 in { let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { -def MASK_WRITE : AMDGPUShaderInst < +def MASK_WRITE : InstR600 < (outs), (ins R600_Reg32:$src), "MASK_WRITE $src", - [] + [], + NullALU >; } // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 @@ -1421,7 +1404,7 @@ def TXD_SHADOW: InstR600 < // Constant Buffer Addressing Support //===----------------------------------------------------------------------===// -let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in { def CONST_COPY : Instruction { let OutOperandList = (outs R600_Reg32:$dst); let InOperandList = (ins i32imm:$src); @@ -1544,23 +1527,6 @@ let Inst{63-32} = Word1; //===---------------------------------------------------------------------===// // Flow and Program control Instructions //===---------------------------------------------------------------------===// -class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> -: Instruction { - - let Namespace = "AMDGPU"; - dag OutOperandList = outs; - dag InOperandList = ins; - let Pattern = pattern; - let AsmString = !strconcat(asmstr, "\n"); - let isPseudo = 1; - let Itinerary = NullALU; - bit hasIEEEFlag = 0; - bit hasZeroOpFlag = 0; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let isCodeGenOnly = 1; -} multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> { def _i32 : ILFormat<(outs), @@ -1592,23 +1558,14 @@ multiclass BranchInstr2<string name> { // Custom Inserter for Branches and returns, this eventually will be a // separate pass //===---------------------------------------------------------------------===// -let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1, + Namespace = "R600" in { def BRANCH : ILFormat<(outs), (ins brtarget:$target), "; Pseudo unconditional branch instruction", [(br bb:$target)]>; defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>; } -//===---------------------------------------------------------------------===// -// Return instruction -//===---------------------------------------------------------------------===// -let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, - usesCustomInserter = 1 in { - def RETURN : ILFormat<(outs), (ins variable_ops), - "RETURN", [(AMDGPUendpgm)] - >; -} - //===----------------------------------------------------------------------===// // Branch Instructions //===----------------------------------------------------------------------===// @@ -1738,13 +1695,8 @@ def : R600Pat < >; // KIL Patterns -def KILP : R600Pat < - (int_AMDGPU_kilp), - (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) ->; - def KIL : R600Pat < - (int_AMDGPU_kill f32:$src0), + (int_r600_kill f32:$src0), (MASK_WRITE (KILLGT (f32 ZERO), $src0)) >; diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td deleted file mode 100644 index 4c9e1e8a5434..000000000000 --- a/lib/Target/AMDGPU/R600Intrinsics.td +++ /dev/null @@ -1,67 +0,0 @@ -//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// R600 Intrinsic Definitions -// -//===----------------------------------------------------------------------===// - -class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [ - llvm_v4f32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty], // coord_type_w - [IntrNoMem] ->; - -class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [ - llvm_v4i32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty], // coord_type_w - [IntrNoMem] ->; - -let TargetPrefix = "r600", isTarget = 1 in { - -def int_r600_store_swizzle : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [] ->; - -def int_r600_store_stream_output : Intrinsic< - [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [] ->; - -def int_r600_tex : TextureIntrinsicFloatInput; -def int_r600_texc : TextureIntrinsicFloatInput; -def int_r600_txl : TextureIntrinsicFloatInput; -def int_r600_txlc : TextureIntrinsicFloatInput; -def int_r600_txb : TextureIntrinsicFloatInput; -def int_r600_txbc : TextureIntrinsicFloatInput; -def int_r600_txf : TextureIntrinsicInt32Input; -def int_r600_txq : TextureIntrinsicInt32Input; -def int_r600_ddx : TextureIntrinsicFloatInput; -def int_r600_ddy : TextureIntrinsicFloatInput; - -def int_r600_dot4 : Intrinsic<[llvm_float_ty], - [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] ->; - -} // End TargetPrefix = "r600", isTarget = 1 diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index a7e540f9d14d..a1429a2ac50f 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -8,13 +8,14 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief R600 Machine Scheduler interface +/// R600 Machine Scheduler interface // //===----------------------------------------------------------------------===// #include "R600MachineScheduler.h" #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Pass.h" @@ -78,7 +79,7 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { AllowSwitchFromAlu = true; } else { unsigned NeededWF = 62.5f / ALUFetchRationEstimate; - DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); + LLVM_DEBUG(dbgs() << NeededWF << " approx. 
Wavefronts Required\n"); // We assume the local GPR requirements to be "dominated" by the requirement // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and // after TEX are indeed likely to consume or generate values from/for the @@ -124,26 +125,24 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { NextInstKind = IDOther; } - DEBUG( - if (SU) { - dbgs() << " ** Pick node **\n"; - SU->dump(DAG); - } else { - dbgs() << "NO NODE \n"; - for (unsigned i = 0; i < DAG->SUnits.size(); i++) { - const SUnit &S = DAG->SUnits[i]; - if (!S.isScheduled) - S.dump(DAG); - } - } - ); + LLVM_DEBUG(if (SU) { + dbgs() << " ** Pick node **\n"; + SU->dump(DAG); + } else { + dbgs() << "NO NODE \n"; + for (unsigned i = 0; i < DAG->SUnits.size(); i++) { + const SUnit &S = DAG->SUnits[i]; + if (!S.isScheduled) + S.dump(DAG); + } + }); return SU; } void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { if (NextInstKind != CurInstKind) { - DEBUG(dbgs() << "Instruction Type Switch\n"); + LLVM_DEBUG(dbgs() << "Instruction Type Switch\n"); if (NextInstKind != IDAlu) OccupedSlotsMask |= 31; CurEmitted = 0; @@ -163,7 +162,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), E = SU->getInstr()->operands_end(); It != E; ++It) { MachineOperand &MO = *It; - if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X) ++CurEmitted; } } @@ -172,8 +171,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { ++CurEmitted; } - - DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); + LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); if (CurInstKind != IDFetch) { MoveUnits(Pending[IDFetch], Available[IDFetch]); @@ -183,18 +181,18 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { static bool isPhysicalRegCopy(MachineInstr *MI) { - if (MI->getOpcode() != AMDGPU::COPY) + if (MI->getOpcode() != R600::COPY) return false; return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); } void R600SchedStrategy::releaseTopNode(SUnit *SU) { - DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG);); + LLVM_DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG);); } void R600SchedStrategy::releaseBottomNode(SUnit *SU) { - DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG);); + LLVM_DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG);); if (isPhysicalRegCopy(SU->getInstr())) { PhysicalRegCopy.push_back(SU); return; @@ -226,14 +224,14 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { return AluTrans; switch (MI->getOpcode()) { - case AMDGPU::PRED_X: + case R600::PRED_X: return AluPredX; - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::DOT_4: return AluT_XYZW; - case AMDGPU::COPY: + case R600::COPY: if (MI->getOperand(1).isUndef()) { // MI will become a KILL, don't considers it in scheduling return AluDiscarded; @@ -248,7 +246,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { if(TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()) || TII->isReductionOp(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::GROUP_BARRIER) { + MI->getOpcode() == R600::GROUP_BARRIER) { return AluT_XYZW; } @@ -259,13 +257,13 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { // Is the 
result already assigned to a channel ? unsigned DestSubReg = MI->getOperand(0).getSubReg(); switch (DestSubReg) { - case AMDGPU::sub0: + case R600::sub0: return AluT_X; - case AMDGPU::sub1: + case R600::sub1: return AluT_Y; - case AMDGPU::sub2: + case R600::sub2: return AluT_Z; - case AMDGPU::sub3: + case R600::sub3: return AluT_W; default: break; @@ -273,16 +271,16 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { // Is the result already member of a X/Y/Z/W class ? unsigned DestReg = MI->getOperand(0).getReg(); - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || - regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &R600::R600_AddrRegClass)) return AluT_X; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass)) return AluT_Y; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass)) return AluT_Z; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass)) return AluT_W; - if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass)) return AluT_XYZW; // LDS src registers cannot be used in the Trans slot. @@ -303,13 +301,13 @@ int R600SchedStrategy::getInstKind(SUnit* SU) { } switch (Opcode) { - case AMDGPU::PRED_X: - case AMDGPU::COPY: - case AMDGPU::CONST_COPY: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case R600::PRED_X: + case R600::COPY: + case R600::CONST_COPY: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::DOT_4: return IDAlu; default: return IDOther; @@ -345,17 +343,17 @@ void R600SchedStrategy::LoadAlu() { } void R600SchedStrategy::PrepareNextSlot() { - DEBUG(dbgs() << "New Slot\n"); + LLVM_DEBUG(dbgs() << "New Slot\n"); assert (OccupedSlotsMask && "Slot wasn't filled"); OccupedSlotsMask = 0; -// if (HwGen == R600Subtarget::NORTHERN_ISLANDS) +// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) // OccupedSlotsMask |= 16; InstructionsGroupCandidate.clear(); LoadAlu(); } void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { - int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst); if (DstIndex == -1) { return; } @@ -372,16 +370,16 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { // Constrains the regclass of DestReg to assign it to Slot switch (Slot) { case 0: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass); break; case 1: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass); break; case 2: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass); break; case 3: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass); break; } } @@ -461,7 +459,7 @@ SUnit* R600SchedStrategy::pickOther(int QID) { } if (!AQ.empty()) { SU = AQ.back(); - AQ.resize(AQ.size() - 1); + AQ.pop_back(); } return SU; } diff --git 
a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h index 9a6770570477..8a9a8d3d1e23 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.h +++ b/lib/Target/AMDGPU/R600MachineScheduler.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief R600 Machine Scheduler interface +/// R600 Machine Scheduler interface // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index cd71f19760b9..7de5e2c9577d 100644 --- a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp +++ b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -1,4 +1,4 @@ -//===- AMDGPUOpenCLImageTypeLoweringPass.cpp ------------------------------===// +//===- R600OpenCLImageTypeLoweringPass.cpp ------------------------------===// // // The LLVM Compiler Infrastructure // @@ -153,7 +153,7 @@ PushArgMD(KernelArgMD &MD, const MDVector &V) { namespace { -class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { +class R600OpenCLImageTypeLoweringPass : public ModulePass { static char ID; LLVMContext *Context; @@ -364,7 +364,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { } public: - AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {} + R600OpenCLImageTypeLoweringPass() : ModulePass(ID) {} bool runOnModule(Module &M) override { Context = &M.getContext(); @@ -376,14 +376,14 @@ public: } StringRef getPassName() const override { - return "AMDGPU OpenCL Image Type Pass"; + return "R600 OpenCL Image Type Pass"; } }; } // end anonymous namespace -char AMDGPUOpenCLImageTypeLoweringPass::ID = 0; +char R600OpenCLImageTypeLoweringPass::ID = 0; -ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() { - return new AMDGPUOpenCLImageTypeLoweringPass(); +ModulePass *llvm::createR600OpenCLImageTypeLoweringPass() { + return new R600OpenCLImageTypeLoweringPass(); } diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 4a14d95f1cc4..692451cb8fe0 100644 --- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -31,6 +31,7 @@ #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" @@ -78,7 +79,7 @@ public: std::vector<unsigned> UndefReg; RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { - assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); + assert(MI->getOpcode() == R600::REG_SEQUENCE); for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { MachineOperand &MO = Instr->getOperand(i); unsigned Chan = Instr->getOperand(i + 1).getImm(); @@ -158,8 +159,8 @@ bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) return true; switch (MI.getOpcode()) { - case AMDGPU::R600_ExportSwz: - case AMDGPU::EG_ExportSwz: + case R600::R600_ExportSwz: + case R600::EG_ExportSwz: return true; default: return false; @@ -212,12 +213,12 @@ MachineInstr *R600VectorRegMerger::RebuildVector( std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg; for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(), E = RSI->RegToChan.end(); It != E; ++It) { - unsigned DstReg = 
MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); unsigned SubReg = (*It).first; unsigned Swizzle = (*It).second; unsigned Chan = getReassignedChan(RemapChan, Swizzle); - MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG), + MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG), DstReg) .addReg(SrcVec) .addReg(SubReg) @@ -228,20 +229,20 @@ MachineInstr *R600VectorRegMerger::RebuildVector( UpdatedUndef.erase(ChanPos); assert(!is_contained(UpdatedUndef, Chan) && "UpdatedUndef shouldn't contain Chan more than once!"); - DEBUG(dbgs() << " ->"; Tmp->dump();); + LLVM_DEBUG(dbgs() << " ->"; Tmp->dump();); (void)Tmp; SrcVec = DstReg; } MachineInstr *NewMI = - BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec); - DEBUG(dbgs() << " ->"; NewMI->dump();); + BuildMI(MBB, Pos, DL, TII->get(R600::COPY), Reg).addReg(SrcVec); + LLVM_DEBUG(dbgs() << " ->"; NewMI->dump();); - DEBUG(dbgs() << " Updating Swizzle:\n"); + LLVM_DEBUG(dbgs() << " Updating Swizzle:\n"); for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), E = MRI->use_instr_end(); It != E; ++It) { - DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->"); + LLVM_DEBUG(dbgs() << " "; (*It).dump(); dbgs() << " ->"); SwizzleInput(*It, RemapChan); - DEBUG((*It).dump()); + LLVM_DEBUG((*It).dump()); } RSI->Instr->eraseFromParent(); @@ -353,7 +354,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); MII != MIIE; ++MII) { MachineInstr &MI = *MII; - if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) { + if (MI.getOpcode() != R600::REG_SEQUENCE) { if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { unsigned Reg = MI.getOperand(1).getReg(); for (MachineRegisterInfo::def_instr_iterator @@ -372,14 +373,14 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { if (!areAllUsesSwizzeable(Reg)) continue; - DEBUG({ + LLVM_DEBUG({ dbgs() << "Trying to optimize "; MI.dump(); }); RegSeqInfo CandidateRSI; std::vector<std::pair<unsigned, unsigned>> RemapChan; - DEBUG(dbgs() << "Using common slots...\n";); + LLVM_DEBUG(dbgs() << "Using common slots...\n";); if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) { // Remove CandidateRSI mapping RemoveMI(CandidateRSI.Instr); @@ -387,7 +388,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { trackRSI(RSI); continue; } - DEBUG(dbgs() << "Using free slots...\n";); + LLVM_DEBUG(dbgs() << "Using free slots...\n";); RemapChan.clear(); if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) { RemoveMI(CandidateRSI.Instr); diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp index 7340318d2d88..612c62b514fd 100644 --- a/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -17,6 +17,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -83,39 +84,39 @@ private: LastDstChan = BISlot; if (TII->isPredicated(*BI)) continue; - int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); + int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) continue; - int DstIdx = 
TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); + int DstIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::dst); if (DstIdx == -1) { continue; } unsigned Dst = BI->getOperand(DstIdx).getReg(); if (isTrans || TII->isTransOnly(*BI)) { - Result[Dst] = AMDGPU::PS; + Result[Dst] = R600::PS; continue; } - if (BI->getOpcode() == AMDGPU::DOT4_r600 || - BI->getOpcode() == AMDGPU::DOT4_eg) { - Result[Dst] = AMDGPU::PV_X; + if (BI->getOpcode() == R600::DOT4_r600 || + BI->getOpcode() == R600::DOT4_eg) { + Result[Dst] = R600::PV_X; continue; } - if (Dst == AMDGPU::OQAP) { + if (Dst == R600::OQAP) { continue; } unsigned PVReg = 0; switch (TRI.getHWRegChan(Dst)) { case 0: - PVReg = AMDGPU::PV_X; + PVReg = R600::PV_X; break; case 1: - PVReg = AMDGPU::PV_Y; + PVReg = R600::PV_Y; break; case 2: - PVReg = AMDGPU::PV_Z; + PVReg = R600::PV_Z; break; case 3: - PVReg = AMDGPU::PV_W; + PVReg = R600::PV_W; break; default: llvm_unreachable("Invalid Chan"); @@ -128,9 +129,9 @@ private: void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs) const { unsigned Ops[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 + R600::OpName::src0, + R600::OpName::src1, + R600::OpName::src2 }; for (unsigned i = 0; i < 3; i++) { int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); @@ -170,7 +171,7 @@ public: return true; if (!TII->isALUInstr(MI.getOpcode())) return true; - if (MI.getOpcode() == AMDGPU::GROUP_BARRIER) + if (MI.getOpcode() == R600::GROUP_BARRIER) return true; // XXX: This can be removed once the packetizer properly handles all the // LDS instruction group restrictions. @@ -184,8 +185,8 @@ public: if (getSlot(*MII) == getSlot(*MIJ)) ConsideredInstUsesAlreadyWrittenVectorElement = true; // Does MII and MIJ share the same pred_sel ? - int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), - OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); + int OpI = TII->getOperandIdx(MII->getOpcode(), R600::OpName::pred_sel), + OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600::OpName::pred_sel); unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; if (PredI != PredJ) @@ -219,7 +220,7 @@ public: } void setIsLastBit(MachineInstr *MI, unsigned Bit) const { - unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); + unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600::OpName::last); MI->getOperand(LastOp).setImm(Bit); } @@ -236,7 +237,7 @@ public: if (ConsideredInstUsesAlreadyWrittenVectorElement && !TII->isVectorOnly(MI) && VLIW5) { isTransSlot = true; - DEBUG({ + LLVM_DEBUG({ dbgs() << "Considering as Trans Inst :"; MI.dump(); }); @@ -249,7 +250,7 @@ public: // Are the Constants limitations met ? CurrentPacketMIs.push_back(&MI); if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { - DEBUG({ + LLVM_DEBUG({ dbgs() << "Couldn't pack :\n"; MI.dump(); dbgs() << "with the following packets :\n"; @@ -266,7 +267,7 @@ public: // Is there a BankSwizzle set that meet Read Port limitations ? 
if (!TII->fitsReadPortLimitations(CurrentPacketMIs, PV, BS, isTransSlot)) { - DEBUG({ + LLVM_DEBUG({ dbgs() << "Couldn't pack :\n"; MI.dump(); dbgs() << "with the following packets :\n"; @@ -300,11 +301,11 @@ public: for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { MachineInstr *MI = CurrentPacketMIs[i]; unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); + R600::OpName::bank_swizzle); MI->getOperand(Op).setImm(BS[i]); } unsigned Op = - TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle); + TII->getOperandIdx(MI.getOpcode(), R600::OpName::bank_swizzle); MI.getOperand(Op).setImm(BS.back()); if (!CurrentPacketMIs.empty()) setIsLastBit(CurrentPacketMIs.back(), 0); @@ -333,6 +334,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // DFA state table should not be empty. assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + assert(Packetizer.getResourceTracker()->getInstrItins()); if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty()) return false; @@ -352,8 +354,8 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { MachineBasicBlock::iterator End = MBB->end(); MachineBasicBlock::iterator MI = MBB->begin(); while (MI != End) { - if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || - (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { + if (MI->isKill() || MI->getOpcode() == R600::IMPLICIT_DEF || + (MI->getOpcode() == R600::CF_ALU && !MI->getOperand(8).getImm())) { MachineBasicBlock::iterator DeleteMI = MI; ++MI; MBB->erase(DeleteMI); diff --git a/lib/Target/AMDGPU/R600Processors.td b/lib/Target/AMDGPU/R600Processors.td index 89194dc1bdf6..f39b3dc1bfd4 100644 --- a/lib/Target/AMDGPU/R600Processors.td +++ b/lib/Target/AMDGPU/R600Processors.td @@ -7,6 +7,62 @@ // //===----------------------------------------------------------------------===// +class SubtargetFeatureFetchLimit <string Value> : + SubtargetFeature <"fetch"#Value, + "TexVTXClauseSize", + Value, + "Limit the maximum number of fetches in a clause to "#Value +>; + +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", + "R600ALUInst", + "false", + "Older version of ALU instructions encoding" +>; + +def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; +def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; + +def FeatureVertexCache : SubtargetFeature<"HasVertexCache", + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache" +>; + +def FeatureCaymanISA : SubtargetFeature<"caymanISA", + "CaymanISA", + "true", + "Use Cayman ISA" +>; + +def FeatureCFALUBug : SubtargetFeature<"cfalubug", + "CFALUBug", + "true", + "GPU has CF_ALU bug" +>; + +class R600SubtargetFeatureGeneration <string Value, + list<SubtargetFeature> Implies> : + SubtargetFeatureGeneration <Value, "R600Subtarget", Implies>; + +def FeatureR600 : R600SubtargetFeatureGeneration<"R600", + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0] +>; + +def FeatureR700 : R600SubtargetFeatureGeneration<"R700", + [FeatureFetchLimit16, FeatureLocalMemorySize0] +>; + +def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", + [FeatureFetchLimit16, FeatureLocalMemorySize32768] +>; + +def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS", + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; + + //===----------------------------------------------------------------------===// // Radeon HD 2000/3000 Series (R600). 
//===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp index 7501facb0cba..38933e7616a0 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief R600 implementation of the TargetRegisterInfo class. +/// R600 implementation of the TargetRegisterInfo class. // //===----------------------------------------------------------------------===// @@ -17,47 +17,51 @@ #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" using namespace llvm; -R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { +R600RegisterInfo::R600RegisterInfo() : R600GenRegisterInfo(0) { RCW.RegWeight = 0; RCW.WeightLimit = 0; } +#define GET_REGINFO_TARGET_DESC +#include "R600GenRegisterInfo.inc" + BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); const R600InstrInfo *TII = ST.getInstrInfo(); - Reserved.set(AMDGPU::ZERO); - Reserved.set(AMDGPU::HALF); - Reserved.set(AMDGPU::ONE); - Reserved.set(AMDGPU::ONE_INT); - Reserved.set(AMDGPU::NEG_HALF); - Reserved.set(AMDGPU::NEG_ONE); - Reserved.set(AMDGPU::PV_X); - Reserved.set(AMDGPU::ALU_LITERAL_X); - Reserved.set(AMDGPU::ALU_CONST); - Reserved.set(AMDGPU::PREDICATE_BIT); - Reserved.set(AMDGPU::PRED_SEL_OFF); - Reserved.set(AMDGPU::PRED_SEL_ZERO); - Reserved.set(AMDGPU::PRED_SEL_ONE); - Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - - for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), - E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { - Reserved.set(*I); + reserveRegisterTuples(Reserved, R600::ZERO); + reserveRegisterTuples(Reserved, R600::HALF); + reserveRegisterTuples(Reserved, R600::ONE); + reserveRegisterTuples(Reserved, R600::ONE_INT); + reserveRegisterTuples(Reserved, R600::NEG_HALF); + reserveRegisterTuples(Reserved, R600::NEG_ONE); + reserveRegisterTuples(Reserved, R600::PV_X); + reserveRegisterTuples(Reserved, R600::ALU_LITERAL_X); + reserveRegisterTuples(Reserved, R600::ALU_CONST); + reserveRegisterTuples(Reserved, R600::PREDICATE_BIT); + reserveRegisterTuples(Reserved, R600::PRED_SEL_OFF); + reserveRegisterTuples(Reserved, R600::PRED_SEL_ZERO); + reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE); + reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR); + + for (TargetRegisterClass::iterator I = R600::R600_AddrRegClass.begin(), + E = R600::R600_AddrRegClass.end(); I != E; ++I) { + reserveRegisterTuples(Reserved, *I); } - TII->reserveIndirectRegisters(Reserved, MF); + TII->reserveIndirectRegisters(Reserved, MF, *this); return Reserved; } // Dummy to not crash RegisterClassInfo. 
-static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; +static const MCPhysReg CalleeSavedReg = R600::NoRegister; const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( const MachineFunction *) const { @@ -65,7 +69,7 @@ const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( } unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; + return R600::NoRegister; } unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { @@ -80,7 +84,7 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( MVT VT) const { switch(VT.SimpleTy) { default: - case MVT::i32: return &AMDGPU::R600_TReg32RegClass; + case MVT::i32: return &R600::R600_TReg32RegClass; } } @@ -93,9 +97,9 @@ bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); switch (Reg) { - case AMDGPU::OQAP: - case AMDGPU::OQBP: - case AMDGPU::AR_X: + case R600::OQAP: + case R600::OQBP: + case R600::AR_X: return false; default: return true; @@ -108,3 +112,10 @@ void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, RegScavenger *RS) const { llvm_unreachable("Subroutines not supported yet"); } + +void R600RegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { + MCRegAliasIterator R(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); +} diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h index f0d9644b02f2..c4c77172b299 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/lib/Target/AMDGPU/R600RegisterInfo.h @@ -8,20 +8,19 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface definition for R600RegisterInfo +/// Interface definition for R600RegisterInfo // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H #define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H -#include "AMDGPURegisterInfo.h" +#define GET_REGINFO_HEADER +#include "R600GenRegisterInfo.inc" namespace llvm { -class AMDGPUSubtarget; - -struct R600RegisterInfo final : public AMDGPURegisterInfo { +struct R600RegisterInfo final : public R600GenRegisterInfo { RegClassWeight RCW; R600RegisterInfo(); @@ -30,12 +29,12 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo { const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; unsigned getFrameRegister(const MachineFunction &MF) const override; - /// \brief get the HW encoding for a register's channel. + /// get the HW encoding for a register's channel. 
unsigned getHWRegChan(unsigned reg) const; unsigned getHWRegIndex(unsigned Reg) const; - /// \brief get the register class of the specified type to use in the + /// get the register class of the specified type to use in the /// CFGStructurizer const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const; @@ -49,6 +48,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo { void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; + + void reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td index 84ab328bdb2b..02164b74a01b 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -245,7 +245,7 @@ def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add V0123_W, V0123_Z, V0123_Y, V0123_X) >; -def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, +def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32, i64, f64], 64, (add (sequence "T%u_XY", 0, 63))>; def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 150d8c3dc3d3..74f1bd8fb986 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" @@ -37,7 +38,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <utility> @@ -133,7 +133,7 @@ INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, char SIAnnotateControlFlow::ID = 0; -/// \brief Initialize all the types and constants used in the pass +/// Initialize all the types and constants used in the pass bool SIAnnotateControlFlow::doInitialization(Module &M) { LLVMContext &Context = M.getContext(); @@ -157,29 +157,29 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { return false; } -/// \brief Is the branch condition uniform or did the StructurizeCFG pass +/// Is the branch condition uniform or did the StructurizeCFG pass /// consider it as such? bool SIAnnotateControlFlow::isUniform(BranchInst *T) { return DA->isUniform(T->getCondition()) || T->getMetadata("structurizecfg.uniform") != nullptr; } -/// \brief Is BB the last block saved on the stack ? +/// Is BB the last block saved on the stack ? bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { return !Stack.empty() && Stack.back().first == BB; } -/// \brief Pop the last saved value from the control flow stack +/// Pop the last saved value from the control flow stack Value *SIAnnotateControlFlow::popSaved() { return Stack.pop_back_val().second; } -/// \brief Push a BB and saved value to the control flow stack +/// Push a BB and saved value to the control flow stack void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { Stack.push_back(std::make_pair(BB, Saved)); } -/// \brief Can the condition represented by this PHI node treated like +/// Can the condition represented by this PHI node treated like /// an "Else" block? 
bool SIAnnotateControlFlow::isElse(PHINode *Phi) { BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); @@ -198,14 +198,14 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) { return true; } -// \brief Erase "Phi" if it is not used any more +// Erase "Phi" if it is not used any more void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { if (RecursivelyDeleteDeadPHINode(Phi)) { - DEBUG(dbgs() << "Erased unused condition phi\n"); + LLVM_DEBUG(dbgs() << "Erased unused condition phi\n"); } } -/// \brief Open a new "If" block +/// Open a new "If" block void SIAnnotateControlFlow::openIf(BranchInst *Term) { if (isUniform(Term)) return; @@ -215,7 +215,7 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) { push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); } -/// \brief Close the last "If" block and open a new "Else" block +/// Close the last "If" block and open a new "Else" block void SIAnnotateControlFlow::insertElse(BranchInst *Term) { if (isUniform(Term)) { return; @@ -225,7 +225,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); } -/// \brief Recursively handle the condition leading to a loop +/// Recursively handle the condition leading to a loop Value *SIAnnotateControlFlow::handleLoopCondition( Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term, SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) { @@ -322,7 +322,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition( llvm_unreachable("Unhandled loop condition!"); } -/// \brief Handle a back edge (loop) +/// Handle a back edge (loop) void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { if (isUniform(Term)) return; @@ -353,7 +353,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { push(Term->getSuccessor(0), Arg); } -/// \brief Close the last opened control flow +/// Close the last opened control flow void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { llvm::Loop *L = LI->getLoopFor(BB); @@ -381,7 +381,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { CallInst::Create(EndCf, Exec, "", FirstInsertionPt); } -/// \brief Annotate the control flow with intrinsics so the backend can +/// Annotate the control flow with intrinsics so the backend can /// recognize if/then/else and loops. bool SIAnnotateControlFlow::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -422,11 +422,15 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { openIf(Term); } - assert(Stack.empty()); + if (!Stack.empty()) { + // CFG was probably not structured. + report_fatal_error("failed to annotate CFG"); + } + return true; } -/// \brief Create the annotation pass +/// Create the annotation pass FunctionPass *llvm::createSIAnnotateControlFlowPass() { return new SIAnnotateControlFlow(); } diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp index b5c439b21b89..7e884ad93a23 100644 --- a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp +++ b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Inserts one nop instruction for each high level source statement for +/// Inserts one nop instruction for each high level source statement for /// debugger usage. /// /// Tools, such as a debugger, need to pause execution based on user input (i.e. 
@@ -21,6 +21,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -62,7 +63,7 @@ FunctionPass *llvm::createSIDebuggerInsertNopsPass() { bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) { // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not // specified. - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (!ST.debuggerInsertNops()) return false; @@ -78,8 +79,8 @@ bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) { for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { - // Skip DBG_VALUE instructions and instructions without location. - if (MI->isDebugValue() || !MI->getDebugLoc()) + // Skip debug instructions and instructions without location. + if (MI->isDebugInstr() || !MI->getDebugLoc()) continue; // Insert nop instruction if line number does not have nop inserted. diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index a9f6069e798a..a6d28d6999e5 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -85,7 +85,10 @@ enum : uint64_t { ClampHi = UINT64_C(1) << 48, // Is a packed VOP3P instruction. - IsPacked = UINT64_C(1) << 49 + IsPacked = UINT64_C(1) << 49, + + // Is a D16 buffer instruction. + D16Buf = UINT64_C(1) << 50 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. @@ -137,7 +140,6 @@ namespace AMDGPU { OPERAND_INPUT_MODS, // Operand for SDWA instructions - OPERAND_SDWA_SRC, OPERAND_SDWA_VOPC_DST, /// Operand with 32-bit immediate that uses the constant bus. @@ -146,6 +148,13 @@ namespace AMDGPU { }; } +namespace SIStackID { +enum StackTypes : uint8_t { + SCRATCH = 0, + SGPR_SPILL = 1 +}; +} + // Input operand modifiers bit-masks // NEG and SEXT share same bit-mask because they can't be set simultaneously. 
namespace SISrcMods { @@ -273,8 +282,9 @@ enum Id { // HwRegCode, (6) [5:0] ID_GPR_ALLOC = 5, ID_LDS_ALLOC = 6, ID_IB_STS = 7, - ID_SYMBOLIC_LAST_ = 8, ID_MEM_BASES = 15, + ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES, + ID_SYMBOLIC_LAST_ = 16, ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) @@ -375,6 +385,44 @@ enum SDWA9EncValues{ }; } // namespace SDWA + +namespace DPP { + +enum DppCtrl { + QUAD_PERM_FIRST = 0, + QUAD_PERM_LAST = 0xFF, + DPP_UNUSED1 = 0x100, + ROW_SHL0 = 0x100, + ROW_SHL_FIRST = 0x101, + ROW_SHL_LAST = 0x10F, + DPP_UNUSED2 = 0x110, + ROW_SHR0 = 0x110, + ROW_SHR_FIRST = 0x111, + ROW_SHR_LAST = 0x11F, + DPP_UNUSED3 = 0x120, + ROW_ROR0 = 0x120, + ROW_ROR_FIRST = 0x121, + ROW_ROR_LAST = 0x12F, + WAVE_SHL1 = 0x130, + DPP_UNUSED4_FIRST = 0x131, + DPP_UNUSED4_LAST = 0x133, + WAVE_ROL1 = 0x134, + DPP_UNUSED5_FIRST = 0x135, + DPP_UNUSED5_LAST = 0x137, + WAVE_SHR1 = 0x138, + DPP_UNUSED6_FIRST = 0x139, + DPP_UNUSED6_LAST = 0x13B, + WAVE_ROR1 = 0x13C, + DPP_UNUSED7_FIRST = 0x13D, + DPP_UNUSED7_LAST = 0x13F, + ROW_MIRROR = 0x140, + ROW_HALF_MIRROR = 0x141, + BCAST15 = 0x142, + BCAST31 = 0x143, + DPP_LAST = BCAST31 +}; + +} // namespace DPP } // namespace AMDGPU #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 8b155c2d2780..566e0d3febc7 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -69,6 +69,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -81,7 +82,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" @@ -110,12 +110,7 @@ namespace { class SIFixSGPRCopies : public MachineFunctionPass { MachineDominatorTree *MDT; - MachinePostDominatorTree *MPDT; - DenseMap<MachineBasicBlock *, SetVector<MachineBasicBlock*>> PDF; - void computePDF(MachineFunction * MF); -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void printPDF(); -#endif + public: static char ID; @@ -128,8 +123,6 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); - AU.addPreserved<MachinePostDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -417,6 +410,12 @@ bool searchPredecessors(const MachineBasicBlock *MBB, return false; } +static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, + const TargetRegisterInfo *TRI) { + return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { + return hasTerminatorThatModifiesExec(*MBB, *TRI); }); +} + // Checks if there is potential path From instruction To instruction. // If CutOff is specified and it sits in between of that path we ignore // a higher portion of the path and report it is not reachable. 
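An aside on the DppCtrl encodings added to SIDefines.h above: the QUAD_PERM_FIRST..QUAD_PERM_LAST range (0x00-0xFF) packs four 2-bit lane selectors, lane 0 in the lowest bits. A minimal, self-contained sketch of that packing (the helper is illustrative, not part of the patch):

// Assemble a quad_perm dpp_ctrl immediate: two bits per lane, lane 0 lowest.
// encodeQuadPerm(1, 0, 3, 2) == 0xB1 swaps adjacent lanes within each quad.
constexpr unsigned encodeQuadPerm(unsigned S0, unsigned S1,
                                  unsigned S2, unsigned S3) {
  return (S0 & 3) | ((S1 & 3) << 2) | ((S2 & 3) << 4) | ((S3 & 3) << 6);
}
static_assert(encodeQuadPerm(0, 1, 2, 3) == 0xE4, "identity permutation");
static_assert(encodeQuadPerm(3, 3, 3, 3) == 0xFF, "matches QUAD_PERM_LAST");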
@@ -515,9 +514,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, if (MDT.dominates(MI1, MI2)) { if (!intereferes(MI2, MI1)) { - DEBUG(dbgs() << "Erasing from " - << printMBBReference(*MI2->getParent()) << " " - << *MI2); + LLVM_DEBUG(dbgs() + << "Erasing from " + << printMBBReference(*MI2->getParent()) << " " << *MI2); MI2->eraseFromParent(); Defs.erase(I2++); Changed = true; @@ -525,9 +524,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, } } else if (MDT.dominates(MI2, MI1)) { if (!intereferes(MI1, MI2)) { - DEBUG(dbgs() << "Erasing from " - << printMBBReference(*MI1->getParent()) << " " - << *MI1); + LLVM_DEBUG(dbgs() + << "Erasing from " + << printMBBReference(*MI1->getParent()) << " " << *MI1); MI1->eraseFromParent(); Defs.erase(I1++); Changed = true; @@ -543,11 +542,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); if (!intereferes(MI1, I) && !intereferes(MI2, I)) { - DEBUG(dbgs() << "Erasing from " - << printMBBReference(*MI1->getParent()) << " " << *MI1 - << "and moving from " - << printMBBReference(*MI2->getParent()) << " to " - << printMBBReference(*I->getParent()) << " " << *MI2); + LLVM_DEBUG(dbgs() + << "Erasing from " + << printMBBReference(*MI1->getParent()) << " " << *MI1 + << "and moving from " + << printMBBReference(*MI2->getParent()) << " to " + << printMBBReference(*I->getParent()) << " " << *MI2); I->getParent()->splice(I, MI2->getParent(), MI2); MI1->eraseFromParent(); Defs.erase(I1++); @@ -567,47 +567,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, return Changed; } -void SIFixSGPRCopies::computePDF(MachineFunction *MF) { - MachineFunction::iterator B = MF->begin(); - MachineFunction::iterator E = MF->end(); - for (; B != E; ++B) { - if (B->succ_size() > 1) { - for (auto S : B->successors()) { - MachineDomTreeNode *runner = MPDT->getNode(&*S); - MachineDomTreeNode *sentinel = MPDT->getNode(&*B)->getIDom(); - while (runner && runner != sentinel) { - PDF[runner->getBlock()].insert(&*B); - runner = runner->getIDom(); - } - } - } - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void SIFixSGPRCopies::printPDF() { - dbgs() << "\n######## PostDominanceFrontiers set #########\n"; - for (auto &I : PDF) { - dbgs() << "PDF[ " << I.first->getNumber() << "] : "; - for (auto &J : I.second) { - dbgs() << J->getNumber() << ' '; - } - dbgs() << '\n'; - } - dbgs() << "\n##############################################\n"; -} -#endif - bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); MDT = &getAnalysis<MachineDominatorTree>(); - MPDT = &getAnalysis<MachinePostDominatorTree>(); - PDF.clear(); - computePDF(&MF); - DEBUG(printPDF()); SmallVector<MachineInstr *, 16> Worklist; @@ -661,28 +626,17 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; - // We don't need to fix the PHI if all the source blocks - // have no divergent control dependecies + // We don't need to fix the PHI if the common dominator of the + // two incoming blocks terminates with a uniform branch. 
bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII); - if (!HasVGPROperand) { - bool Uniform = true; - MachineBasicBlock * Join = MI.getParent(); - for (auto &O : MI.explicit_operands()) { - if (O.isMBB()) { - MachineBasicBlock * Source = O.getMBB(); - SetVector<MachineBasicBlock*> &SourcePDF = PDF[Source]; - SetVector<MachineBasicBlock*> &JoinPDF = PDF[Join]; - SetVector<MachineBasicBlock*> CDList; - for (auto &I : SourcePDF) { - if (!JoinPDF.count(I) || /* back edge */MDT->dominates(Join, I)) { - if (hasTerminatorThatModifiesExec(*I, *TRI)) - Uniform = false; - } - } - } - } - if (Uniform) { - DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n'); + if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) { + MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); + MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); + + if (!predsHasDivergentTerminator(MBB0, TRI) && + !predsHasDivergentTerminator(MBB1, TRI)) { + LLVM_DEBUG(dbgs() + << "Not fixing PHI for uniform branch: " << MI << '\n'); break; } } @@ -722,7 +676,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { SmallSet<unsigned, 8> Visited; if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) { - DEBUG(dbgs() << "Fixing PHI: " << MI); + LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); TII->moveToVALU(MI); } break; @@ -734,7 +688,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { continue; } - DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); + LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); TII->moveToVALU(MI); break; @@ -745,7 +699,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); if (TRI->isSGPRClass(DstRC) && (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { - DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); + LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); TII->moveToVALU(MI); } break; diff --git a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp index 3d3121788b5e..15ba78edf919 100644 --- a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp @@ -8,13 +8,14 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Add implicit use of exec to vector register copies. +/// Add implicit use of exec to vector register copies. 
/// //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; @@ -46,7 +47,7 @@ char SIFixVGPRCopies::ID = 0; char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID; bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); bool Changed = false; @@ -58,7 +59,7 @@ bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) { if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) { MI.addOperand(MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); - DEBUG(dbgs() << "Add exec use to " << MI); + LLVM_DEBUG(dbgs() << "Add exec use to " << MI); Changed = true; } break; diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp index 3493c7775f0c..5d613d8874fa 100644 --- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp +++ b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Computations in WWM can overwrite values in inactive channels for +/// Computations in WWM can overwrite values in inactive channels for /// variables that the register allocator thinks are dead. This pass adds fake /// uses of those variables to WWM instructions to make sure that they aren't /// overwritten. @@ -55,6 +55,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -184,7 +185,7 @@ bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) { // This doesn't actually need LiveIntervals, but we can preserve them. LIS = getAnalysisIfAvailable<LiveIntervals>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 783181980342..338cabcb906b 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -13,6 +13,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -75,7 +76,7 @@ public: MachineRegisterInfo *MRI; const SIInstrInfo *TII; const SIRegisterInfo *TRI; - const SISubtarget *ST; + const GCNSubtarget *ST; void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, @@ -127,14 +128,18 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII, unsigned Opc = UseMI.getOpcode(); switch (Opc) { case AMDGPU::V_MAC_F32_e64: - case AMDGPU::V_MAC_F16_e64: { + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_FMAC_F32_e64: { // Special case for mac. Since this is replaced with mad when folded into // src2, we need to check the legality for the final instruction. 
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (static_cast<int>(OpNo) == Src2Idx) { + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; - const MCInstrDesc &MadDesc - = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + + unsigned Opc = IsFMA ? + AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + const MCInstrDesc &MadDesc = TII->get(Opc); return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); } return false; @@ -155,6 +160,35 @@ static bool updateOperand(FoldCandidate &Fold, assert(Old.isReg()); if (Fold.isImm()) { + if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) { + // Set op_sel/op_sel_hi on this operand or bail out if op_sel is + // already set. + unsigned Opcode = MI->getOpcode(); + int OpNo = MI->getOperandNo(&Old); + int ModIdx = -1; + if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) + ModIdx = AMDGPU::OpName::src0_modifiers; + else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) + ModIdx = AMDGPU::OpName::src1_modifiers; + else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) + ModIdx = AMDGPU::OpName::src2_modifiers; + assert(ModIdx != -1); + ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); + MachineOperand &Mod = MI->getOperand(ModIdx); + unsigned Val = Mod.getImm(); + if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) + return false; + // If upper part is all zero we do not need op_sel_hi. + if (!isUInt<16>(Fold.ImmToFold)) { + if (!(Fold.ImmToFold & 0xffff)) { + Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + return true; + } + Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + } + } Old.ChangeToImmediate(Fold.ImmToFold); return true; } @@ -195,13 +229,17 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); - if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) && + if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F32_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + unsigned NewOpc = IsFMA ? + AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); // Check if changing this to a v_mad_{f16, f32} instruction will allow us // to fold the operand. - MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16)); + MI->setDesc(TII->get(NewOpc)); bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII); if (FoldAsMAD) { MI->untieRegOperand(OpNo); @@ -345,6 +383,7 @@ void SIFoldOperands::foldOperand( // Don't fold into target independent nodes. Target independent opcodes // don't have defined register classes. if (UseDesc.isVariadic() || + UseOp.isImplicit() || UseDesc.OpInfo[UseOpIdx].RegClass == -1) return; } @@ -470,7 +509,8 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, MachineOperand &Op) { if (Op.isReg()) { // If this has a subregister, it obviously is a register source. 
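The packed-immediate fold added to updateOperand above is easier to follow with concrete numbers. Below is a standalone sketch of the same decision; the helper and the OP_SEL_0/OP_SEL_1 constants are stand-ins assumed for illustration, not LLVM API:

#include <cassert>
#include <cstdint>

constexpr uint32_t OP_SEL_0 = 1u << 2;  // stand-in for SISrcMods::OP_SEL_0
constexpr uint32_t OP_SEL_1 = 1u << 3;  // stand-in for SISrcMods::OP_SEL_1

// Returns false if op_sel is already customised; otherwise rewrites Mods/Imm
// the same way the fold above does for a 32-bit literal on a packed operand.
bool foldPackedImm(uint32_t &Mods, uint32_t &Imm) {
  if ((Mods & OP_SEL_0) || !(Mods & OP_SEL_1))
    return false;
  if (Imm > 0xFFFFu) {                  // literal does not fit in 16 bits
    if ((Imm & 0xFFFFu) == 0) {         // only the high half carries data:
      Mods = (Mods | OP_SEL_0) & ~OP_SEL_1;
      Imm = (Imm >> 16) & 0xFFFFu;      // shift it down, select it via op_sel
      return true;
    }
    Mods &= ~OP_SEL_1;                  // keep the full literal, clear op_sel_hi
  }
  return true;                          // 16-bit literals fold with Mods untouched
}

int main() {
  uint32_t Mods = OP_SEL_1, Imm = 0x3C000000;  // high half 0x3C00, low half 0
  assert(foldPackedImm(Mods, Imm) && Imm == 0x3C00 && (Mods & OP_SEL_0));
  return 0;
}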
- if (Op.getSubReg() != AMDGPU::NoSubRegister) + if (Op.getSubReg() != AMDGPU::NoSubRegister || + !TargetRegisterInfo::isVirtualRegister(Op.getReg())) return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); @@ -598,14 +638,14 @@ static bool tryFoldInst(const SIInstrInfo *TII, const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1); if (Src1->isIdenticalTo(*Src0)) { - DEBUG(dbgs() << "Folded " << *MI << " into "); + LLVM_DEBUG(dbgs() << "Folded " << *MI << " into "); int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (Src2Idx != -1) MI->RemoveOperand(Src2Idx); MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false))); - DEBUG(dbgs() << *MI << '\n'); + LLVM_DEBUG(dbgs() << *MI << '\n'); return true; } } @@ -646,7 +686,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // be folded due to multiple uses or operand constraints. if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { - DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n'); + LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n'); // Some constant folding cases change the same immediate's use to a new // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user @@ -713,8 +753,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // copies. MRI->clearKillFlags(Fold.OpToFold->getReg()); } - DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << - static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); + LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " + << static_cast<int>(Fold.UseOpNo) << " of " + << *Fold.UseMI << '\n'); tryFoldInst(TII, Fold.UseMI); } else if (Fold.isCommuted()) { // Restoring instruction's original operand order if fold has failed. @@ -794,7 +835,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { if (!DefClamp) return false; - DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n'); + LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def + << '\n'); // Clamp is applied after omod, so it is OK if omod is set. DefClamp->setImm(1); @@ -917,7 +959,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp)) return false; - DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n'); + LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n'); DefOMod->setImm(OMod); MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg()); @@ -930,7 +972,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { return false; MRI = &MF.getRegInfo(); - ST = &MF.getSubtarget<SISubtarget>(); + ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp new file mode 100644 index 000000000000..cd14239de822 --- /dev/null +++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -0,0 +1,398 @@ +//===-- SIFormMemoryClauses.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass creates bundles of SMEM and VMEM instructions forming memory +/// clauses if XNACK is enabled. Def operands of clauses are marked as early +/// clobber to make sure we will not override any source within a clause. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "GCNRegPressure.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-form-memory-clauses" + +// Clauses longer then 15 instructions would overflow one of the counters +// and stall. They can stall even earlier if there are outstanding counters. +static cl::opt<unsigned> +MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15), + cl::desc("Maximum length of a memory clause, instructions")); + +namespace { + +class SIFormMemoryClauses : public MachineFunctionPass { + typedef DenseMap<unsigned, std::pair<unsigned, LaneBitmask>> RegUse; + +public: + static char ID; + +public: + SIFormMemoryClauses() : MachineFunctionPass(ID) { + initializeSIFormMemoryClausesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI Form memory clauses"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + template <typename Callable> + void forAllLanes(unsigned Reg, LaneBitmask LaneMask, Callable Func) const; + + bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const; + bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT); + void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const; + bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses, + GCNDownwardRPTracker &RPT); + + const GCNSubtarget *ST; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + SIMachineFunctionInfo *MFI; + + unsigned LastRecordedOccupancy; + unsigned MaxVGPRs; + unsigned MaxSGPRs; +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIFormMemoryClauses, DEBUG_TYPE, + "SI Form memory clauses", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIFormMemoryClauses, DEBUG_TYPE, + "SI Form memory clauses", false, false) + + +char SIFormMemoryClauses::ID = 0; + +char &llvm::SIFormMemoryClausesID = SIFormMemoryClauses::ID; + +FunctionPass *llvm::createSIFormMemoryClausesPass() { + return new SIFormMemoryClauses(); +} + +static bool isVMEMClauseInst(const MachineInstr &MI) { + return SIInstrInfo::isFLAT(MI) || SIInstrInfo::isVMEM(MI); +} + +static bool isSMEMClauseInst(const MachineInstr &MI) { + return SIInstrInfo::isSMRD(MI); +} + +// There no sense to create store clauses, they do not define anything, +// thus there is nothing to set early-clobber. 
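To make the effect of the new pass concrete before its helper predicates below, here is a schematic before/after written as comments; the instruction names, registers and offsets are illustrative only, not output of this patch:

// Before: two adjacent scalar loads from the same pointer, XNACK enabled.
//   %0 = S_LOAD_DWORDX2_IMM %ptr, 0, 0
//   %1 = S_LOAD_DWORDX2_IMM %ptr, 8, 0
//
// After SIFormMemoryClauses: both loads sit in one BUNDLE and their defs are
// early-clobber, so the register allocator cannot reuse %ptr (or any other
// source of the clause) for a result written inside the clause.
//   BUNDLE early-clobber %0, early-clobber %1, %ptr {
//     %0 = S_LOAD_DWORDX2_IMM %ptr, 0, 0
//     %1 = S_LOAD_DWORDX2_IMM %ptr, 8, 0
//   }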
+static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { + if (MI.isDebugValue() || MI.isBundled()) + return false; + if (!MI.mayLoad() || MI.mayStore()) + return false; + if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 || + AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1) + return false; + if (IsVMEMClause && !isVMEMClauseInst(MI)) + return false; + if (!IsVMEMClause && !isSMEMClauseInst(MI)) + return false; + return true; +} + +static unsigned getMopState(const MachineOperand &MO) { + unsigned S = 0; + if (MO.isImplicit()) + S |= RegState::Implicit; + if (MO.isDead()) + S |= RegState::Dead; + if (MO.isUndef()) + S |= RegState::Undef; + if (MO.isKill()) + S |= RegState::Kill; + if (MO.isEarlyClobber()) + S |= RegState::EarlyClobber; + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable()) + S |= RegState::Renamable; + return S; +} + +template <typename Callable> +void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask, + Callable Func) const { + if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) || + LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) { + Func(0); + return; + } + + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + unsigned E = TRI->getNumSubRegIndices(); + SmallVector<unsigned, AMDGPU::NUM_TARGET_SUBREGS> CoveringSubregs; + for (unsigned Idx = 1; Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (TRI->getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == LaneMask) { + Func(Idx); + return; + } + + if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none()) + continue; + + CoveringSubregs.push_back(Idx); + } + + llvm::sort(CoveringSubregs.begin(), CoveringSubregs.end(), + [this](unsigned A, unsigned B) { + LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A); + LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B); + unsigned NA = MaskA.getNumLanes(); + unsigned NB = MaskB.getNumLanes(); + if (NA != NB) + return NA > NB; + return MaskA.getHighestLane() > MaskB.getHighestLane(); + }); + + for (unsigned Idx : CoveringSubregs) { + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none()) + continue; + + Func(Idx); + LaneMask &= ~SubRegMask; + if (LaneMask.none()) + return; + } + + llvm_unreachable("Failed to find all subregs to cover lane mask"); +} + +// Returns false if there is a use of a def already in the map. +// In this case we must break the clause. +bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, + RegUse &Defs, RegUse &Uses) const { + // Check interference with defs. + for (const MachineOperand &MO : MI.operands()) { + // TODO: Prologue/Epilogue Insertion pass does not process bundled + // instructions. + if (MO.isFI()) + return false; + + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + + // If it is tied we will need to write same register as we read. + if (MO.isTied()) + return false; + + RegUse &Map = MO.isDef() ? Uses : Defs; + auto Conflict = Map.find(Reg); + if (Conflict == Map.end()) + continue; + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return false; + + LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); + if ((Conflict->second.second & Mask).any()) + return false; + } + + return true; +} + +// Since all defs in the clause are early clobber we can run out of registers. 
+// Function returns false if pressure would hit the limit if instruction is +// bundled into a memory clause. +bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI, + GCNDownwardRPTracker &RPT) { + // NB: skip advanceBeforeNext() call. Since all defs will be marked + // early-clobber they will all stay alive at least to the end of the + // clause. Therefor we should not decrease pressure even if load + // pointer becomes dead and could otherwise be reused for destination. + RPT.advanceToNext(); + GCNRegPressure MaxPressure = RPT.moveMaxPressure(); + unsigned Occupancy = MaxPressure.getOccupancy(*ST); + if (Occupancy >= MFI->getMinAllowedOccupancy() && + MaxPressure.getVGPRNum() <= MaxVGPRs && + MaxPressure.getSGPRNum() <= MaxSGPRs) { + LastRecordedOccupancy = Occupancy; + return true; + } + return false; +} + +// Collect register defs and uses along with their lane masks and states. +void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI, + RegUse &Defs, RegUse &Uses) const { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + + LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ? + TRI->getSubRegIndexLaneMask(MO.getSubReg()) : + LaneBitmask::getAll(); + RegUse &Map = MO.isDef() ? Defs : Uses; + + auto Loc = Map.find(Reg); + unsigned State = getMopState(MO); + if (Loc == Map.end()) { + Map[Reg] = std::make_pair(State, Mask); + } else { + Loc->second.first |= State; + Loc->second.second |= Mask; + } + } +} + +// Check register def/use conflicts, occupancy limits and collect def/use maps. +// Return true if instruction can be bundled with previous. It it cannot +// def/use maps are not updated. +bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI, + RegUse &Defs, RegUse &Uses, + GCNDownwardRPTracker &RPT) { + if (!canBundle(MI, Defs, Uses)) + return false; + + if (!checkPressure(MI, RPT)) + return false; + + collectRegUses(MI, Defs, Uses); + return true; +} + +bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->isXNACKEnabled()) + return false; + + const SIInstrInfo *TII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); + MRI = &MF.getRegInfo(); + MFI = MF.getInfo<SIMachineFunctionInfo>(); + LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + SlotIndexes *Ind = LIS->getSlotIndexes(); + bool Changed = false; + + MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count(); + MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count(); + + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::instr_iterator Next; + for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + bool IsVMEM = isVMEMClauseInst(MI); + + if (!isValidClauseInst(MI, IsVMEM)) + continue; + + RegUse Defs, Uses; + GCNDownwardRPTracker RPT(*LIS); + RPT.reset(MI); + + if (!processRegUses(MI, Defs, Uses, RPT)) + continue; + + unsigned Length = 1; + for ( ; Next != E && Length < MaxClause; ++Next) { + if (!isValidClauseInst(*Next, IsVMEM)) + break; + + // A load from pointer which was loaded inside the same bundle is an + // impossible clause because we will need to write and read the same + // register inside. In this case processRegUses will return false. 
+ if (!processRegUses(*Next, Defs, Uses, RPT)) + break; + + ++Length; + } + if (Length < 2) + continue; + + Changed = true; + MFI->limitOccupancy(LastRecordedOccupancy); + + auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE)); + Ind->insertMachineInstrInMaps(*B); + + for (auto BI = I; BI != Next; ++BI) { + BI->bundleWithPred(); + Ind->removeSingleMachineInstrFromMaps(*BI); + + for (MachineOperand &MO : BI->defs()) + if (MO.readsReg()) + MO.setIsInternalRead(true); + } + + for (auto &&R : Defs) { + forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) { + unsigned S = R.second.first | RegState::EarlyClobber; + if (!SubReg) + S &= ~(RegState::Undef | RegState::Dead); + B.addDef(R.first, S, SubReg); + }); + } + + for (auto &&R : Uses) { + forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) { + B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg); + }); + } + + for (auto &&R : Defs) { + unsigned Reg = R.first; + Uses.erase(Reg); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + + for (auto &&R : Uses) { + unsigned Reg = R.first; + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + } + } + + return Changed; +} diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 89bb98dbd028..ac0ef90f25a4 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -12,7 +12,9 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -21,19 +23,19 @@ using namespace llvm; -static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST, +static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST, const MachineFunction &MF) { return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); } -static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST, +static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST, const MachineFunction &MF) { return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); } -void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, +void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const { const SIInstrInfo *TII = ST.getInstrInfo(); @@ -96,7 +98,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, } unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( - const SISubtarget &ST, + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, @@ -147,7 +149,7 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( // SGPRs. std::pair<unsigned, unsigned> SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( - const SISubtarget &ST, + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, @@ -218,7 +220,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was // specified. 
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (ST.debuggerEmitPrologue()) emitDebuggerPrologue(MF, MBB); @@ -235,6 +237,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = MF.getFunction(); // We need to do the replacement of the private segment buffer and wave offset // register even if there are no stack objects. There could be stores to undef @@ -286,7 +289,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2(MF)) { + if (ST.isAmdCodeObjectV2(F)) { PreloadedPrivateBufferReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); } @@ -305,7 +308,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { - assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)); + assert(ST.isAmdCodeObjectV2(F) || ST.isMesaGfxShader(F)); MRI.addLiveIn(PreloadedPrivateBufferReg); MBB.addLiveIn(PreloadedPrivateBufferReg); } @@ -330,7 +333,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, bool CopyBuffer = ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister && - ST.isAmdCodeObjectV2(MF) && + ST.isAmdCodeObjectV2(F) && ScratchRsrcReg != PreloadedPrivateBufferReg; // This needs to be careful of the copying order to avoid overwriting one of @@ -361,13 +364,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. -void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, +void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, unsigned ScratchRsrcReg) const { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const Function &Fn = MF.getFunction(); DebugLoc DL; if (ST.isAmdPalOS()) { @@ -387,12 +391,27 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); BuildMI(MBB, I, DL, GetPC64, Rsrc01); } + auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in + if (ST.hasMergedShaders()) { + switch (MF.getFunction().getCallingConv()) { + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_GS: + // Low GIT address is passed in s8 rather than s0 for an LS+HS or + // ES+GS merged shader on gfx9+. + GitPtrLo = AMDGPU::SGPR8; + break; + default: + break; + } + } + MF.getRegInfo().addLiveIn(GitPtrLo); + MF.front().addLiveIn(GitPtrLo); BuildMI(MBB, I, DL, SMovB32, RsrcLo) - .addReg(AMDGPU::SGPR0) // Low address passed in + .addReg(GitPtrLo) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); // We now have the GIT ptr - now get the scratch descriptor from the entry - // at offset 0. + // at offset 0 (or offset 16 for a compute shader). 
PointerType *PtrTy = PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()), AMDGPUAS::CONSTANT_ADDRESS); @@ -403,17 +422,18 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable, 0, 0); + unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) .addReg(Rsrc01) - .addImm(0) // offset + .addImm(Offset) // offset .addImm(0) // glc .addReg(ScratchRsrcReg, RegState::ImplicitDefine) .addMemOperand(MMO); return; } - if (ST.isMesaGfxShader(MF) + if (ST.isMesaGfxShader(Fn) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) { - assert(!ST.isAmdCodeObjectV2(MF)); + assert(!ST.isAmdCodeObjectV2(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); @@ -474,17 +494,52 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, } } +// Find a scratch register that we can use at the start of the prologue to +// re-align the stack pointer. We avoid using callee-save registers since they +// may appear to be free when this is called from canUseAsPrologue (during +// shrink wrapping), but then no longer be free when this is called from +// emitPrologue. +// +// FIXME: This is a bit conservative, since in the above case we could use one +// of the callee-save registers as a scratch temp to re-align the stack pointer, +// but we would then have to make sure that we were in fact saving at least one +// callee-save register in the prologue, which is additional complexity that +// doesn't seem worth the benefit. +static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) { + MachineFunction *MF = MBB.getParent(); + + const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo(); + LivePhysRegs LiveRegs(TRI); + LiveRegs.addLiveIns(MBB); + + // Mark callee saved registers as used so we will not choose them. + const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF); + for (unsigned i = 0; CSRegs[i]; ++i) + LiveRegs.addReg(CSRegs[i]); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + + for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) { + if (LiveRegs.available(MRI, Reg)) + return Reg; + } + + return AMDGPU::NoRegister; +} + void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); if (FuncInfo->isEntryFunction()) { emitEntryFunctionPrologue(MF, MBB); return; } const MachineFrameInfo &MFI = MF.getFrameInfo(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); unsigned FramePtrReg = FuncInfo->getFrameOffsetReg(); @@ -492,8 +547,34 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc DL; + // XXX - Is this the right predicate? 
+ bool NeedFP = hasFP(MF); - if (NeedFP) { + uint32_t NumBytes = MFI.getStackSize(); + uint32_t RoundedSize = NumBytes; + const bool NeedsRealignment = TRI.needsStackRealignment(MF); + + if (NeedsRealignment) { + assert(NeedFP); + const unsigned Alignment = MFI.getMaxAlignment(); + + RoundedSize += Alignment; + + unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB); + assert(ScratchSPReg != AMDGPU::NoRegister); + + // s_add_u32 tmp_reg, s32, NumBytes + // s_and_b32 s32, tmp_reg, 0b111...0000 + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg) + .addReg(StackPtrReg) + .addImm((Alignment - 1) * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) + .addReg(ScratchSPReg, RegState::Kill) + .addImm(-Alignment * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); + FuncInfo->setIsStackRealigned(true); + } else if (NeedFP) { // If we need a base pointer, set it up here. It's whatever the value of // the stack pointer is at this point. Any variable size objects will be // allocated after this, so we can still use the base pointer to reference @@ -503,11 +584,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - uint32_t NumBytes = MFI.getStackSize(); - if (NumBytes != 0 && hasSP(MF)) { + if (RoundedSize != 0 && hasSP(MF)) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(NumBytes * ST.getWavefrontSize()) + .addImm(RoundedSize * ST.getWavefrontSize()) .setMIFlag(MachineInstr::FrameSetup); } @@ -527,7 +607,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, if (FuncInfo->isEntryFunction()) return; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); @@ -553,10 +633,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, // it's really whether we need SP to be accurate or not. if (NumBytes != 0 && hasSP(MF)) { + uint32_t RoundedSize = FuncInfo->isStackRealigned() ? 
+ NumBytes + MFI.getMaxAlignment() : NumBytes; + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(NumBytes * ST.getWavefrontSize()) - .setMIFlag(MachineInstr::FrameDestroy); + .addImm(RoundedSize * ST.getWavefrontSize()); } } @@ -572,7 +654,7 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { - const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); FrameReg = RI->getFrameRegister(MF); return MF.getFrameInfo().getObjectOffset(FI); @@ -586,7 +668,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (!MFI.hasStackObjects()) return; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); @@ -611,6 +693,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (TII->isSGPRSpill(MI)) { int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); + assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL); if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS); (void)Spilled; @@ -667,7 +750,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( if (Amount == 0) return MBB.erase(I); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = I->getDebugLoc(); unsigned Opc = I->getOpcode(); @@ -696,7 +779,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -746,7 +829,8 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const { } bool SIFrameLowering::hasSP(const MachineFunction &MF) const { + const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); // All stack operations are relative to the frame offset SGPR. 
const MachineFrameInfo &MFI = MF.getFrameInfo(); - return MFI.hasCalls() || MFI.hasVarSizedObjects(); + return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF); } diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index df6f1632a316..2f35b3631cdc 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -17,7 +17,7 @@ namespace llvm { class SIInstrInfo; class SIMachineFunctionInfo; class SIRegisterInfo; -class SISubtarget; +class GCNSubtarget; class SIFrameLowering final : public AMDGPUFrameLowering { public: @@ -48,29 +48,29 @@ public: MachineBasicBlock::iterator MI) const override; private: - void emitFlatScratchInit(const SISubtarget &ST, + void emitFlatScratchInit(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const; unsigned getReservedPrivateSegmentBufferReg( - const SISubtarget &ST, + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const; std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg( - const SISubtarget &ST, + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - /// \brief Emits debugger prologue. + /// Emits debugger prologue. void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. - void emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineFunction &MF, + void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, unsigned ScratchRsrcReg) const; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 50ee88fa635a..5b7fc2656a20 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Custom DAG lowering for SI +/// Custom DAG lowering for SI // //===----------------------------------------------------------------------===// @@ -26,6 +26,7 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -49,7 +50,6 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -73,6 +73,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include <cassert> @@ -111,8 +112,9 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) { } SITargetLowering::SITargetLowering(const TargetMachine &TM, - const SISubtarget &STI) - : AMDGPUTargetLowering(TM, STI) { + const GCNSubtarget &STI) + : AMDGPUTargetLowering(TM, STI), + Subtarget(&STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); @@ -138,14 +140,15 @@ 
SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); - } - if (Subtarget->hasVOP3PInsts()) { + // Unless there are also VOP3P operations, not operations are really legal. addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } - computeRegisterProperties(STI.getRegisterInfo()); + computeRegisterProperties(Subtarget->getRegisterInfo()); // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v2i32, Custom); @@ -173,7 +176,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); @@ -205,13 +207,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -231,13 +237,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBCARRY, MVT::i64, Legal); #endif - //setOperationAction(ISD::ADDC, MVT::i64, Expand); - //setOperationAction(ISD::SUBC, MVT::i64, Expand); - // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64}) { + MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -260,6 +263,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } } + setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand); + // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that // is expanded to avoid having two separate loops in case the index is a VGPR. @@ -284,12 +289,30 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); + // Avoid stack access for these. // TODO: Generalize to more vector types. 
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, // and output demarshalling @@ -301,7 +324,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); - if (getSubtarget()->hasFlatAddressSpace()) { + if (Subtarget->hasFlatAddressSpace()) { setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); } @@ -314,13 +337,56 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Custom); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::FLOG, MVT::f16, Custom); + setOperationAction(ISD::FLOG10, MVT::f16, Custom); + } + + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + + if (Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); + + // We only really have 32-bit BFE instructions (and 16-bit on VI). + // + // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any + // effort to match them now. We want this to be false for i64 cases when the + // extraction isn't restricted to the upper or lower half. Ideally we would + // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that + // span the midpoint are probably relatively rare, so don't worry about them + // for now. 
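// Illustrative sketch, not part of the patch: a scalar model of the 32-bit
// bitfield extract the comment above refers to, assuming the usual
// v_bfe_u32 behaviour of "shift right by offset, keep the low 'width' bits".
// The helper name is made up for this example.
#include <cassert>
#include <cstdint>

static uint32_t bfe_u32(uint32_t src, unsigned offset, unsigned width) {
  offset &= 31;
  width &= 31;                 // a width of 0 extracts nothing
  if (width == 0)
    return 0;
  return (src >> offset) & ((1u << width) - 1u);
}

int main() {
  uint64_t v = 0x123456789abcdef0ull;
  // An i64 extract confined to the low half maps onto one 32-bit BFE.
  assert(bfe_u32(static_cast<uint32_t>(v), 4, 12) == ((v >> 4) & 0xfff));
  // An extract spanning bit 32 (say bits [28..39]) would need pieces from
  // both halves plus a shift/or; that is the midpoint-spanning case the
  // comment above treats as rare.
  return 0;
}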
+ if (Subtarget->hasBFE()) + setHasExtractBitsInsn(true); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); - if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); + } else { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); } setOperationAction(ISD::FFLOOR, MVT::f64, Legal); @@ -357,6 +423,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); setOperationAction(ISD::CTLZ, MVT::i16, Promote); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); + setOperationAction(ISD::CTPOP, MVT::i16, Promote); setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); @@ -406,10 +473,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f16, Legal); if (!Subtarget->hasFP16Denormals()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - } - if (Subtarget->hasVOP3PInsts()) { - for (MVT VT : {MVT::v2i16, MVT::v2f16}) { + for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -436,6 +501,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::Constant, MVT::v2i16, Legal); setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); + setOperationAction(ISD::UNDEF, MVT::v2i16, Legal); + setOperationAction(ISD::UNDEF, MVT::v2f16, Legal); + setOperationAction(ISD::STORE, MVT::v2i16, Promote); AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); setOperationAction(ISD::STORE, MVT::v2f16, Promote); @@ -452,11 +520,38 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32); setOperationAction(ISD::XOR, MVT::v2i16, Promote); AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32); - setOperationAction(ISD::SELECT, MVT::v2i16, Promote); - AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); - setOperationAction(ISD::SELECT, MVT::v2f16, Promote); - AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); + setOperationAction(ISD::LOAD, MVT::v4i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v4f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v4i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); + + setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand); + + if (!Subtarget->hasVOP3PInsts()) { + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); + } + + setOperationAction(ISD::FNEG, MVT::v2f16, Legal); + // This isn't really legal, 
but this avoids the legalizer unrolling it (and + // allows matching fneg (fabs x) patterns) + setOperationAction(ISD::FABS, MVT::v2f16, Legal); + } + + if (Subtarget->hasVOP3PInsts()) { setOperationAction(ISD::ADD, MVT::v2i16, Legal); setOperationAction(ISD::SUB, MVT::v2i16, Legal); setOperationAction(ISD::MUL, MVT::v2i16, Legal); @@ -469,26 +564,51 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMAX, MVT::v2i16, Legal); setOperationAction(ISD::FADD, MVT::v2f16, Legal); - setOperationAction(ISD::FNEG, MVT::v2f16, Legal); setOperationAction(ISD::FMUL, MVT::v2f16, Legal); setOperationAction(ISD::FMA, MVT::v2f16, Legal); setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal); - - // This isn't really legal, but this avoids the legalizer unrolling it (and - // allows matching fneg (fabs x) patterns) - setOperationAction(ISD::FABS, MVT::v2f16, Legal); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + setOperationAction(ISD::SHL, MVT::v4i16, Custom); + setOperationAction(ISD::SRA, MVT::v4i16, Custom); + setOperationAction(ISD::SRL, MVT::v4i16, Custom); + setOperationAction(ISD::ADD, MVT::v4i16, Custom); + setOperationAction(ISD::SUB, MVT::v4i16, Custom); + setOperationAction(ISD::MUL, MVT::v4i16, Custom); + + setOperationAction(ISD::SMIN, MVT::v4i16, Custom); + setOperationAction(ISD::SMAX, MVT::v4i16, Custom); + setOperationAction(ISD::UMIN, MVT::v4i16, Custom); + setOperationAction(ISD::UMAX, MVT::v4i16, Custom); + + setOperationAction(ISD::FADD, MVT::v4f16, Custom); + setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); + setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); + + setOperationAction(ISD::SELECT, MVT::v4i16, Custom); + setOperationAction(ISD::SELECT, MVT::v4f16, Custom); + } + + setOperationAction(ISD::FNEG, MVT::v4f16, Custom); + setOperationAction(ISD::FABS, MVT::v4f16, Custom); + + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::SELECT, MVT::v2i16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); + setOperationAction(ISD::SELECT, MVT::v2f16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); } else { + // Legalization hack. setOperationAction(ISD::SELECT, MVT::v2i16, Custom); setOperationAction(ISD::SELECT, MVT::v2f16, Custom); + + setOperationAction(ISD::FNEG, MVT::v2f16, Custom); + setOperationAction(ISD::FABS, MVT::v2f16, Custom); } for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) { @@ -503,6 +623,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SMIN); setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMIN); @@ -540,16 +661,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); setSchedulingPreference(Sched::RegPressure); + + // SI at least has hardware support for floating point exceptions, but no way + // of using or handling them is implemented. 
They are also optional in OpenCL + // (Section 7.3) + setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); } -const SISubtarget *SITargetLowering::getSubtarget() const { - return static_cast<const SISubtarget *>(Subtarget); +const GCNSubtarget *SITargetLowering::getSubtarget() const { + return Subtarget; } //===----------------------------------------------------------------------===// // TargetLowering queries //===----------------------------------------------------------------------===// +// v_mad_mix* support a conversion from f16 to f32. +// +// There is only one special case when denormals are enabled we don't currently, +// where this is OK to use. +bool SITargetLowering::isFPExtFoldable(unsigned Opcode, + EVT DestVT, EVT SrcVT) const { + return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && + DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && + SrcVT.getScalarType() == MVT::f16; +} + bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. @@ -560,9 +698,55 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, unsigned IntrID) const { + if (const AMDGPU::RsrcIntrinsic *RsrcIntr = + AMDGPU::lookupRsrcIntrinsic(IntrID)) { + AttributeList Attr = Intrinsic::getAttributes(CI.getContext(), + (Intrinsic::ID)IntrID); + if (Attr.hasFnAttribute(Attribute::ReadNone)) + return false; + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + if (RsrcIntr->IsImage) { + Info.ptrVal = MFI->getImagePSV( + *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), + CI.getArgOperand(RsrcIntr->RsrcArg)); + Info.align = 0; + } else { + Info.ptrVal = MFI->getBufferPSV( + *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), + CI.getArgOperand(RsrcIntr->RsrcArg)); + } + + Info.flags = MachineMemOperand::MODereferenceable; + if (Attr.hasFnAttribute(Attribute::ReadOnly)) { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.flags |= MachineMemOperand::MOLoad; + } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); + Info.flags |= MachineMemOperand::MOStore; + } else { + // Atomic + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable; + + // XXX - Should this be volatile without known ordering? 
+ Info.flags |= MachineMemOperand::MOVolatile; + } + return true; + } + switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -575,6 +759,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } + default: return false; } @@ -585,7 +770,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, Type *&AccessTy) const { switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); Ops.push_back(Ptr); @@ -675,7 +863,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AS == AMDGPUASI.GLOBAL_ADDRESS) return isLegalGlobalAddressingMode(AM); - if (AS == AMDGPUASI.CONSTANT_ADDRESS) { + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -686,19 +875,19 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // will use a MUBUF load. // FIXME?: We also need to do this if unaligned, but we don't know the // alignment here. - if (DL.getTypeStoreSize(Ty) < 4) + if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4) return isLegalGlobalAddressingMode(AM); - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { // SMRD instructions have an 8-bit, dword offset on SI. if (!isUInt<8>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) { + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { // On CI+, this can also be a 32-bit literal constant offset. If it fits // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; @@ -798,7 +987,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { - *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ? + *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS || + AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ? 
(Align % 4 == 0) : true; } @@ -841,7 +1031,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) { return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS; + AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT; } bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, @@ -853,7 +1044,7 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); const Value *Ptr = MemNode->getMemOperand()->getValue(); - const Instruction *I = dyn_cast<Instruction>(Ptr); + const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); return I && I->getMetadata("amdgpu.noclobber"); } @@ -870,7 +1061,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); - return AMDGPU::isUniformMMO(MemNode->getMemOperand()); + return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand()); } TargetLoweringBase::LegalizeTypeAction @@ -932,14 +1123,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); - return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Offset, SL, PtrVT)); + return DAG.getObjectPtrOffset(SL, BasePtr, Offset); } SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const { - auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); - uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(), + FIRST_IMPLICIT); return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); } @@ -966,18 +1156,42 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, SDValue SITargetLowering::lowerKernargMemParameter( SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, - uint64_t Offset, bool Signed, + uint64_t Offset, unsigned Align, bool Signed, const ISD::InputArg *Arg) const { - const DataLayout &DL = DAG.getDataLayout(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - unsigned Align = DL.getABITypeAlignment(Ty); + // Try to avoid using an extload by loading earlier than the argument address, + // and extracting the relevant bits. The load should hopefully be merged with + // the previous argument. + if (MemVT.getStoreSize() < 4 && Align < 4) { + // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). + int64_t AlignDownOffset = alignDown(Offset, 4); + int64_t OffsetDiff = Offset - AlignDownOffset; + + EVT IntVT = MemVT.changeTypeToInteger(); + + // TODO: If we passed in the base kernel offset we could have a better + // alignment than 4, but we don't really need it. 
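// Illustrative sketch, not part of the patch: the byte-level effect of the
// dword-aligned load plus shift described above, modelling the kernarg
// segment as a little-endian byte buffer. Helper names are made up here.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t load_dword(const uint8_t *kernarg, uint64_t offset) {
  uint32_t d;
  std::memcpy(&d, kernarg + offset, 4);          // a naturally aligned dword load
  return d;
}

static uint16_t load_i16_arg(const uint8_t *kernarg, uint64_t offset) {
  uint64_t aligned_down = offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t diff = offset - aligned_down;         // OffsetDiff in bytes
  uint32_t dword = load_dword(kernarg, aligned_down);
  return static_cast<uint16_t>(dword >> (diff * 8)); // SRL, then TRUNCATE
}

int main() {
  // Two i16 arguments packed at offsets 4 and 6: both are served by the same
  // dword load, so no extending sub-dword load is needed.
  uint8_t kernarg[8] = {0, 0, 0, 0, 0x34, 0x12, 0x78, 0x56};
  assert(load_i16_arg(kernarg, 4) == 0x1234);
  assert(load_i16_arg(kernarg, 6) == 0x5678);
  return 0;
}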
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); + SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + + SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); + SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); + + SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract); + ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal); + ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg); + + + return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL); + } SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, - MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); @@ -1052,36 +1266,51 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, FunctionType *FType, SIMachineFunctionInfo *Info) { for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { - const ISD::InputArg &Arg = Ins[I]; + const ISD::InputArg *Arg = &Ins[I]; // First check if it's a PS input addr. - if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal() && PSInputNum <= 15) { + if (CallConv == CallingConv::AMDGPU_PS && + !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) { + + bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum); + + // Inconveniently only the first part of the split is marked as isSplit, + // so skip to the end. We only want to increment PSInputNum once for the + // entire split argument. + if (Arg->Flags.isSplit()) { + while (!Arg->Flags.isSplitEnd()) { + assert(!Arg->VT.isVector() && + "unexpected vector split in ps argument type"); + if (!SkipArg) + Splits.push_back(*Arg); + Arg = &Ins[++I]; + } + } - if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { + if (SkipArg) { // We can safely skip PS inputs. - Skipped.set(I); + Skipped.set(Arg->getOrigArgIndex()); ++PSInputNum; continue; } Info->markPSInputAllocated(PSInputNum); - if (Arg.Used) + if (Arg->Used) Info->markPSInputEnabled(PSInputNum); ++PSInputNum; } // Second split vertices into their elements. - if (Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; + if (Arg->VT.isVector()) { + ISD::InputArg NewArg = *Arg; NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); + NewArg.VT = Arg->VT.getVectorElementType(); // We REALLY want the ORIGINAL number of vertex elements here, e.g. a // three or five element vertex only needs three or five registers, // NOT four or eight. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + Type *ParamType = FType->getParamType(Arg->getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); for (unsigned J = 0; J != NumElements; ++J) { @@ -1089,7 +1318,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, NewArg.PartOffset += NewArg.VT.getStoreSize(); } } else { - Splits.push_back(Arg); + Splits.push_back(*Arg); } } } @@ -1347,8 +1576,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // the scratch registers to pass in. 
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (ST.isAmdCodeObjectV2(MF)) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (ST.isAmdCodeObjectV2(MF.getFunction())) { if (RequiresStackAccess) { // If we have stack objects, we unquestionably need the private buffer // resource. For the Code Object V2 ABI, this will be the first 4 user @@ -1460,12 +1689,12 @@ SDValue SITargetLowering::LowerFormalArguments( const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); + const Function &Fn = MF.getFunction(); FunctionType *FType = MF.getFunction().getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { - const Function &Fn = MF.getFunction(); DiagnosticInfoUnsupported NoGraphicsHSA( Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); DAG.getContext()->diagnose(NoGraphicsHSA); @@ -1562,9 +1791,16 @@ SDValue SITargetLowering::LowerFormalArguments( SmallVector<SDValue, 16> Chains; - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + // FIXME: This is the minimum kernel argument alignment. We should improve + // this to the maximum alignment of the arguments. + // + // FIXME: Alignment of explicit arguments totally broken with non-0 explicit + // kern arg offset. + const unsigned KernelArgBaseAlign = 16; + + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; - if (Skipped[i]) { + if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) { InVals.push_back(DAG.getUNDEF(Arg.VT)); continue; } @@ -1576,19 +1812,16 @@ SDValue SITargetLowering::LowerFormalArguments( VT = Ins[i].VT; EVT MemVT = VA.getLocVT(); - const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) + - VA.getLocMemOffset(); - Info->setABIArgOffset(Offset + MemVT.getStoreSize()); + const uint64_t Offset = VA.getLocMemOffset(); + unsigned Align = MinAlign(KernelArgBaseAlign, Offset); - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes. SDValue Arg = lowerKernargMemParameter( - DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]); + DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be @@ -1696,7 +1929,7 @@ SDValue SITargetLowering::LowerFormalArguments( auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo()); + ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); unsigned StackArgSize = CCInfo.getNextStackOffset(); Info->setBytesInStackArgArea(StackArgSize); @@ -1841,8 +2074,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // FIXME: Does sret work properly? 
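// Illustrative sketch, not part of the patch: how the per-argument alignment
// above falls out of the 16-byte kernarg base alignment and the argument's
// offset. The standalone helper mirrors what llvm::MinAlign computes (the
// largest power of two dividing both inputs).
#include <cassert>
#include <cstdint>

static uint64_t min_align(uint64_t a, uint64_t b) {
  uint64_t x = a | b;
  return x & (~x + 1);  // lowest set bit, i.e. the common power-of-two factor
}

int main() {
  const uint64_t KernelArgBaseAlign = 16;
  assert(min_align(KernelArgBaseAlign, 0) == 16); // first argument
  assert(min_align(KernelArgBaseAlign, 8) == 8);
  assert(min_align(KernelArgBaseAlign, 6) == 2);  // a packed 16-bit argument
  return 0;
}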
if (!Info->isEntryFunction()) { - const SIRegisterInfo *TRI - = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { @@ -1944,8 +2176,7 @@ void SITargetLowering::passSpecialInputs( SelectionDAG &DAG = CLI.DAG; const SDLoc &DL = CLI.DL; - const SISubtarget *ST = getSubtarget(); - const SIRegisterInfo *TRI = ST->getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); @@ -2138,6 +2369,13 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, "unsupported required tail call to function "); } + if (AMDGPU::isShader(MF.getFunction().getCallingConv())) { + // Note the issue is with the CC of the calling function, not of the call + // itself. + return lowerUnhandledCall(CLI, InVals, + "unsupported call from graphics shader of function "); + } + // The first 4 bytes are reserved for the callee's emergency stack slot. const unsigned CalleeUsableStackOffset = 4; @@ -2383,7 +2621,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -2443,7 +2681,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, } - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { report_fatal_error(Twine("invalid register \"" + StringRef(RegName) + "\" for subtarget.")); @@ -2517,7 +2755,8 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( unsigned PhiReg, unsigned InitSaveExecReg, int Offset, - bool UseGPRIdxMode) { + bool UseGPRIdxMode, + bool IsIndirectSrc) { MachineBasicBlock::iterator I = LoopBB.begin(); unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -2546,6 +2785,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( .addReg(CurrentIdxReg) .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); + // Update EXEC, save the original EXEC value to VCC. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) + .addReg(CondReg, RegState::Kill); + + MRI.setSimpleHint(NewExec, CondReg); + if (UseGPRIdxMode) { unsigned IdxReg; if (Offset == 0) { @@ -2556,11 +2801,13 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( .addReg(CurrentIdxReg, RegState::Kill) .addImm(Offset); } - - MachineInstr *SetIdx = - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX)) - .addReg(IdxReg, RegState::Kill); - SetIdx->getOperand(2).setIsUndef(); + unsigned IdxMode = IsIndirectSrc ? + VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; + MachineInstr *SetOn = + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .addReg(IdxReg, RegState::Kill) + .addImm(IdxMode); + SetOn->getOperand(3).setIsUndef(); } else { // Move index from VCC into M0 if (Offset == 0) { @@ -2573,12 +2820,6 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( } } - // Update EXEC, save the original EXEC value to VCC. 
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) - .addReg(CondReg, RegState::Kill); - - MRI.setSimpleHint(NewExec, CondReg); - // Update EXEC, switch all done bits to 0 and all todo bits to 1. MachineInstr *InsertPt = BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) @@ -2606,7 +2847,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, unsigned InitResultReg, unsigned PhiReg, int Offset, - bool UseGPRIdxMode) { + bool UseGPRIdxMode, + bool IsIndirectSrc) { MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = MI.getDebugLoc(); @@ -2645,7 +2887,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, InitResultReg, DstReg, PhiReg, TmpExec, - Offset, UseGPRIdxMode); + Offset, UseGPRIdxMode, IsIndirectSrc); MachineBasicBlock::iterator First = RemainderBB->begin(); BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) @@ -2730,7 +2972,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, // Control flow needs to be inserted if indexing with a VGPR. static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, - const SISubtarget &ST) { + const GCNSubtarget &ST) { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineFunction *MF = MBB.getParent(); @@ -2780,17 +3022,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); - if (UseGPRIdxMode) { - MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addImm(0) // Reset inside loop. - .addImm(VGPRIndexMode::SRC0_ENABLE); - SetOn->getOperand(3).setIsUndef(); - - // Disable again after the loop. - BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); - } - - auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode); + auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, + Offset, UseGPRIdxMode, true); MachineBasicBlock *LoopBB = InsPt->getParent(); if (UseGPRIdxMode) { @@ -2798,6 +3031,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, .addReg(SrcReg, RegState::Undef, SubReg) .addReg(SrcReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); + BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) .addReg(SrcReg, RegState::Undef, SubReg) @@ -2829,7 +3063,7 @@ static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, - const SISubtarget &ST) { + const GCNSubtarget &ST) { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineFunction *MF = MBB.getParent(); @@ -2898,22 +3132,10 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); - if (UseGPRIdxMode) { - MachineBasicBlock::iterator I(&MI); - - MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addImm(0) // Reset inside loop. - .addImm(VGPRIndexMode::DST_ENABLE); - SetOn->getOperand(3).setIsUndef(); - - // Disable again after the loop. 
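// Illustrative sketch, not part of the patch: the waterfall loop that
// emitLoadM0FromVGPRLoop builds, modelled with plain integers (4 lanes here
// for brevity; a real wave has 64 lanes and the masks live in EXEC/VCC).
// Each trip picks the index held by the first active lane, services every
// lane that holds the same index, and clears those lanes until none remain.
#include <cassert>
#include <cstdint>

static unsigned waterfall_trips(const uint32_t idx[4], uint64_t exec) {
  unsigned trips = 0;
  while (exec) {                                   // s_cbranch_execnz back edge
    unsigned first = 0;
    while (!((exec >> first) & 1))
      ++first;                                     // v_readfirstlane_b32
    uint32_t cur = idx[first];
    uint64_t same = 0;
    for (unsigned l = 0; l < 4; ++l)               // v_cmp_eq_u32 across lanes
      if (((exec >> l) & 1) && idx[l] == cur)
        same |= uint64_t(1) << l;
    // The body (for example S_SET_GPR_IDX_ON plus the indexed move) runs here
    // with the execution mask narrowed to 'same', so the index is wave-uniform.
    ++trips;
    exec &= ~same;                                 // drop the serviced lanes
  }
  return trips;
}

int main() {
  uint32_t idx[4] = {0, 2, 0, 1};                  // per-lane dynamic indices
  assert(waterfall_trips(idx, 0xf) == 3);          // three distinct index values
  assert(waterfall_trips(idx, 0x5) == 1);          // lanes 0 and 2 agree
  return 0;
}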
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); - } - unsigned PhiReg = MRI.createVirtualRegister(VecRC); auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, - Offset, UseGPRIdxMode); + Offset, UseGPRIdxMode, false); MachineBasicBlock *LoopBB = InsPt->getParent(); if (UseGPRIdxMode) { @@ -2923,6 +3145,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, .addReg(Dst, RegState::ImplicitDefine) .addReg(PhiReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); + BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); @@ -2946,24 +3169,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); if (TII->isMIMG(MI)) { - if (!MI.memoperands_empty()) - return BB; + if (MI.memoperands_empty() && MI.mayLoadOrStore()) { + report_fatal_error("missing mem operand from MIMG instruction"); + } // Add a memoperand for mimg instructions so that they aren't assumed to // be ordered memory instuctions. - MachinePointerInfo PtrInfo(MFI->getImagePSV()); - MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable; - if (MI.mayStore()) - Flags |= MachineMemOperand::MOStore; - - if (MI.mayLoad()) - Flags |= MachineMemOperand::MOLoad; - - if (Flags != MachineMemOperand::MODereferenceable) { - auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0); - MI.addMemOperand(*MF, MMO); - } - return BB; } @@ -3145,8 +3356,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::ADJCALLSTACKDOWN: { const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); MachineInstrBuilder MIB(*MF, &MI); + + // Add an implicit use of the frame offset reg to prevent the restore copy + // inserted after the call from being reorderd after stack operations in the + // the caller's frame. MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) - .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); + .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit) + .addReg(Info->getFrameOffsetReg(), RegState::Implicit); return BB; } case AMDGPU::SI_CALL_ISEL: @@ -3236,12 +3452,17 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: + case MVT::f32: { // This is as fast on some subtargets. However, we always have full rate f32 // mad available which returns the same result as the separate operations // which we should prefer over fma. We can't use this if we want to support // denormals, so only report this in these cases. - return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); + if (Subtarget->hasFP32Denormals()) + return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); + + // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. + return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); + } case MVT::f64: return true; case MVT::f16: @@ -3257,6 +3478,49 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { // Custom DAG Lowering Operations //===----------------------------------------------------------------------===// +// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the +// wider vector type is legal. 
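// Illustrative sketch, not part of the patch: the shape of the split that
// splitUnaryVectorOp / splitBinaryVectorOp perform below. A v4i16 value is
// held as two packed v2i16 halves (one 32-bit word each), the operation runs
// once per half, and the results are concatenated. The packed add stands in
// for any per-lane v2i16 operation; the names are made up for this example.
#include <cassert>
#include <cstdint>
#include <utility>

typedef uint32_t V2I16;                        // lane 0 in bits [15:0], lane 1 in [31:16]
typedef std::pair<V2I16, V2I16> V4I16;         // low half, high half

static V2I16 add_v2i16(V2I16 a, V2I16 b) {     // per-lane add, no cross-lane carry
  uint32_t lo = (a + b) & 0x0000ffffu;
  uint32_t hi = ((a & 0xffff0000u) + (b & 0xffff0000u)) & 0xffff0000u;
  return lo | hi;
}

static V4I16 add_v4i16(V4I16 a, V4I16 b) {     // split, operate per half, concat
  return V4I16(add_v2i16(a.first, b.first), add_v2i16(a.second, b.second));
}

int main() {
  V4I16 a(0x00020001u, 0x00040003u);           // <1, 2, 3, 4>
  V4I16 b(0x00200010u, 0xffff0030u);           // <0x10, 0x20, 0x30, 0xffff>
  V4I16 r = add_v4i16(a, b);
  assert(r.first == 0x00220011u);              // <0x11, 0x22, ...>
  assert(r.second == 0x00030033u);             // <..., 0x33, 4 + 0xffff wraps>
  return 0;
}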
+SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4f16); + + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); + + SDLoc SL(Op); + SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + +// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the +// wider vector type is legal. +SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4i16 || VT == MVT::v4f16); + + SDValue Lo0, Hi0; + std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Lo1, Hi1; + std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); + + SDLoc SL(Op); + + SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -3289,15 +3553,105 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::BUILD_VECTOR: + return lowerBUILD_VECTOR(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); case ISD::TRAP: - case ISD::DEBUGTRAP: return lowerTRAP(Op, DAG); + case ISD::DEBUGTRAP: + return lowerDEBUGTRAP(Op, DAG); + case ISD::FABS: + case ISD::FNEG: + return splitUnaryVectorOp(Op, DAG); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FADD: + case ISD::FMUL: + return splitBinaryVectorOp(Op, DAG); } return SDValue(); } +static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, + const SDLoc &DL, + SelectionDAG &DAG, bool Unpacked) { + if (!LoadVT.isVector()) + return Result; + + if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. + // Truncate to v2i16/v4i16. + EVT IntLoadVT = LoadVT.changeTypeToInteger(); + + // Workaround legalizer not scalarizing truncate after vector op + // legalization byt not creating intermediate vector trunc. + SmallVector<SDValue, 4> Elts; + DAG.ExtractVectorElements(Result, Elts); + for (SDValue &Elt : Elts) + Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt); + + Result = DAG.getBuildVector(IntLoadVT, DL, Elts); + + // Bitcast to original type (v2f16/v4f16). + return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); + } + + // Cast back to the original packed type. + return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); +} + +SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, + MemSDNode *M, + SelectionDAG &DAG, + bool IsIntrinsic) const { + SDLoc DL(M); + SmallVector<SDValue, 10> Ops; + Ops.reserve(M->getNumOperands()); + + Ops.push_back(M->getOperand(0)); + if (IsIntrinsic) + Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32)); + + // Skip 1, as it is the intrinsic ID. 
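// Illustrative sketch, not part of the patch: the repacking that
// adjustLoadValueTypeImpl above performs for "unpacked D16" subtargets, where
// a d16 load returns each 16-bit element in its own 32-bit register. The
// per-element truncate-and-rebuild below is the scalar equivalent; on packed
// subtargets the loaded dword is already the final v2f16 bit pattern.
#include <cassert>
#include <cstdint>

static uint32_t repack_v2f16(uint32_t elt0, uint32_t elt1) {
  // Truncate each 32-bit element to 16 bits, then rebuild the packed pair.
  return (elt0 & 0xffffu) | ((elt1 & 0xffffu) << 16);
}

int main() {
  // Half-precision bit patterns for 1.0 (0x3c00) and 2.0 (0x4000).
  uint32_t unpacked0 = 0x00003c00u, unpacked1 = 0x00004000u; // v2i32 form
  uint32_t packed = 0x40003c00u;                             // v2f16 form
  assert(repack_v2f16(unpacked0, unpacked1) == packed);
  return 0;
}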
+ for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I) + Ops.push_back(M->getOperand(I)); + + bool Unpacked = Subtarget->hasUnpackedD16VMem(); + EVT LoadVT = M->getValueType(0); + + EVT EquivLoadVT = LoadVT; + if (Unpacked && LoadVT.isVector()) { + EquivLoadVT = LoadVT.isVector() ? + EVT::getVectorVT(*DAG.getContext(), MVT::i32, + LoadVT.getVectorNumElements()) : LoadVT; + } + + // Change from v4f16/v2f16 to EquivLoadVT. + SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); + + SDValue Load + = DAG.getMemIntrinsicNode( + IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, + VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + if (!Unpacked) // Just adjusted the opcode. + return Load; + + SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); + + return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -3314,7 +3668,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - if (IID == Intrinsic::amdgcn_cvt_pkrtz) { + switch (IID) { + case Intrinsic::amdgcn_cvt_pkrtz: { SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); SDLoc SL(N); @@ -3323,6 +3678,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); return; } + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + SDValue Src0 = N->getOperand(1); + SDValue Src1 = N->getOperand(2); + SDLoc SL(N); + unsigned Opcode; + + if (IID == Intrinsic::amdgcn_cvt_pknorm_i16) + Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; + else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16) + Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; + else if (IID == Intrinsic::amdgcn_cvt_pk_i16) + Opcode = AMDGPUISD::CVT_PK_I16_I32; + else + Opcode = AMDGPUISD::CVT_PK_U16_U32; + + SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt)); + return; + } + } + break; + } + case ISD::INTRINSIC_W_CHAIN: { + if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + return; + } + break; } case ISD::SELECT: { @@ -3347,12 +3734,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect)); return; } + case ISD::FNEG: { + if (N->getValueType(0) != MVT::v2f16) + break; + + SDLoc SL(N); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); + + SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, + BC, + DAG.getConstant(0x80008000, SL, MVT::i32)); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); + return; + } + case ISD::FABS: { + if (N->getValueType(0) != MVT::v2f16) + break; + + SDLoc SL(N); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); + + SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, + BC, + DAG.getConstant(0x7fff7fff, SL, MVT::i32)); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); + return; + } default: break; } } -/// \brief Helper function for LowerBRCOND +/// Helper function for LowerBRCOND static SDNode *findUser(SDValue Value, unsigned Opcode) { SDNode *Parent = Value.getNode(); @@ -3417,13 +3830,15 @@ void 
SITargetLowering::createDebuggerPrologueStackObjects( bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { const Triple &TT = getTargetMachine().getTargetTriple(); - return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && + return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && AMDGPU::shouldEmitConstantsToTextSection(TT); } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); } @@ -3560,40 +3975,37 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); - MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); - unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ? - SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap; - - if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && - Subtarget->isTrapHandlerEnabled()) { - SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - unsigned UserSGPR = Info->getQueuePtrUserSGPR(); - assert(UserSGPR != AMDGPU::NoRegister); - - SDValue QueuePtr = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); - - SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); - - SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, - QueuePtr, SDValue()); + if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || + !Subtarget->isTrapHandlerEnabled()) + return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); - SDValue Ops[] = { - ToReg, - DAG.getTargetConstant(TrapID, SL, MVT::i16), - SGPR01, - ToReg.getValue(1) - }; + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + SDValue QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); + SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, + QueuePtr, SDValue()); + SDValue Ops[] = { + ToReg, + DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16), + SGPR01, + ToReg.getValue(1) + }; + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); +} - return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); - } +SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Chain = Op.getOperand(0); + MachineFunction &MF = DAG.getMachineFunction(); - switch (TrapID) { - case SISubtarget::TrapIDLLVMTrap: - return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); - case SISubtarget::TrapIDLLVMDebugTrap: { + if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || + !Subtarget->isTrapHandlerEnabled()) { DiagnosticInfoUnsupported NoTrap(MF.getFunction(), "debugtrap handler not supported", Op.getDebugLoc(), @@ -3602,11 +4014,12 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { Ctx.diagnose(NoTrap); return Chain; } - default: - 
llvm_unreachable("unsupported trap handler type!"); - } - return Chain; + SDValue Ops[] = { + Chain, + DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16) + }; + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, @@ -3719,34 +4132,78 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { + SDValue Vec = Op.getOperand(0); + SDValue InsVal = Op.getOperand(1); SDValue Idx = Op.getOperand(2); + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned VecSize = VecVT.getSizeInBits(); + unsigned EltSize = EltVT.getSizeInBits(); + + + assert(VecSize <= 64); + + unsigned NumElts = VecVT.getVectorNumElements(); + SDLoc SL(Op); + auto KIdx = dyn_cast<ConstantSDNode>(Idx); + + if (NumElts == 4 && EltSize == 16 && KIdx) { + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec); + + SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, + DAG.getConstant(0, SL, MVT::i32)); + SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, + DAG.getConstant(1, SL, MVT::i32)); + + SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf); + SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf); + + unsigned Idx = KIdx->getZExtValue(); + bool InsertLo = Idx < 2; + SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, + InsertLo ? LoVec : HiVec, + DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal), + DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32)); + + InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf); + + SDValue Concat = InsertLo ? + DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) : + DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf }); + + return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); + } + if (isa<ConstantSDNode>(Idx)) return SDValue(); + MVT IntVT = MVT::getIntegerVT(VecSize); + // Avoid stack access for dynamic indexing. - SDLoc SL(Op); - SDValue Vec = Op.getOperand(0); - SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1)); + SDValue Val = InsVal; + if (InsVal.getValueType() == MVT::f16) + Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal); // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val); + SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val); - // Convert vector index to bit-index. - SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, - DAG.getConstant(16, SL, MVT::i32)); + assert(isPowerOf2_32(EltSize)); + SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); - SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + // Convert vector index to bit-index. 
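// Illustrative sketch, not part of the patch: the bitfield-merge idea behind
// the v_bfm_b32 / v_bfi_b32 comment above, written out with plain integers.
// One way to realise it: build a mask for the selected element from the
// scaled index, move the new value under that mask, and merge with the
// original packed vector. Helper and variable names are made up here.
#include <cassert>
#include <cstdint>

static uint64_t insert_elt16(uint64_t vec, uint16_t val, unsigned idx) {
  unsigned scaled = idx * 16;                 // vector index -> bit index
  uint64_t bfm = uint64_t(0xffff) << scaled;  // mask covering the element
  uint64_t placed = uint64_t(val) << scaled;  // value moved into position
  return (bfm & placed) | (~bfm & vec);       // bitfield insert / merge
}

int main() {
  uint64_t v4 = 0x0004000300020001ull;        // v4i16 <1, 2, 3, 4>
  assert(insert_elt16(v4, 0xabcd, 2) == 0x0004abcd00020001ull);
  assert(insert_elt16(v4, 0xabcd, 0) == 0x000400030002abcdull);
  return 0;
}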
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); - SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32, - DAG.getConstant(0xffff, SL, MVT::i32), + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); + SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, + DAG.getConstant(0xffff, SL, IntVT), ScaledIdx); - SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal); - SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32, - DAG.getNOT(SL, BFM, MVT::i32), BCVec); + SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); + SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT, + DAG.getNOT(SL, BFM, IntVT), BCVec); - SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS); - return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI); + SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS); + return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); } SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, @@ -3756,51 +4213,87 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, EVT ResultVT = Op.getValueType(); SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); + EVT VecVT = Vec.getValueType(); + unsigned VecSize = VecVT.getSizeInBits(); + EVT EltVT = VecVT.getVectorElementType(); + assert(VecSize <= 64); DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); - // Make sure we we do any optimizations that will make it easier to fold + // Make sure we do any optimizations that will make it easier to fold // source modifiers before obscuring it with bit operations. // XXX - Why doesn't this get called when vector_shuffle is expanded? if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; - if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) { - SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + unsigned EltSize = EltVT.getSizeInBits(); + assert(isPowerOf2_32(EltSize)); - if (CIdx->getZExtValue() == 1) { - Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result, - DAG.getConstant(16, SL, MVT::i32)); - } else { - assert(CIdx->getZExtValue() == 0); - } + MVT IntVT = MVT::getIntegerVT(VecSize); + SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); + + // Convert vector index to bit-index (* EltSize) + SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); - if (ResultVT.bitsLT(MVT::i32)) - Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); + SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx); + + if (ResultVT == MVT::f16) { + SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt); return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); } - SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32); + return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT); +} - // Convert vector index to bit-index. - SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen); +SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT VT = Op.getValueType(); + + if (VT == MVT::v4i16 || VT == MVT::v4f16) { + EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); + + // Turn into pair of packed build_vectors. + // TODO: Special case for constants that can be materialized with s_mov_b64. 
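// Illustrative sketch, not part of the patch: what the v4i16/v4f16
// BUILD_VECTOR lowering below amounts to at the bit level. The four 16-bit
// elements are packed pairwise into two 32-bit words (the two packed
// build_vectors), which together form the 64-bit result. Names are made up.
#include <cassert>
#include <cstdint>

static uint32_t pack2(uint16_t e0, uint16_t e1) {            // one packed half
  return uint32_t(e0) | (uint32_t(e1) << 16);
}

static uint64_t build_v4i16(uint16_t e0, uint16_t e1, uint16_t e2, uint16_t e3) {
  uint32_t lo = pack2(e0, e1);                               // elements 0 and 1
  uint32_t hi = pack2(e2, e3);                               // elements 2 and 3
  return uint64_t(lo) | (uint64_t(hi) << 32);                // concatenated pair
}

int main() {
  assert(build_v4i16(1, 2, 3, 4) == 0x0004000300020001ull);
  return 0;
}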
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(0), Op.getOperand(1) }); + SDValue Hi = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(2), Op.getOperand(3) }); + + SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); + SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi); + + SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); - SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx); + assert(VT == MVT::v2f16 || VT == MVT::v2i16); - SDValue Result = Elt; - if (ResultVT.bitsLT(MVT::i32)) - Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); - return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); + Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); + Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi); + + Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo); + Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi); + + SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi, + DAG.getConstant(16, SL, MVT::i32)); + + SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi); + + return DAG.getNode(ISD::BITCAST, SL, VT, Or); } bool SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // We can fold offsets for anything that doesn't require a GOT relocation. return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && + GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && !shouldEmitGOTReloc(GA->getGlobal()); } @@ -3853,6 +4346,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS && + GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT && GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS && // FIXME: It isn't correct to rely on the type of the pointer. This should // be removed when address space 0 is 64-bit. @@ -3905,7 +4399,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, unsigned Offset) const { SDLoc SL(Op); SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL, - DAG.getEntryNode(), Offset, false); + DAG.getEntryNode(), Offset, 4, false); // The local size values will have the hi 16-bits as zero. 
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, DAG.getValueType(VT)); @@ -3929,6 +4423,245 @@ static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, return DAG.getUNDEF(VT); } +static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, + ArrayRef<SDValue> Elts) { + assert(!Elts.empty()); + MVT Type; + unsigned NumElts; + + if (Elts.size() == 1) { + Type = MVT::f32; + NumElts = 1; + } else if (Elts.size() == 2) { + Type = MVT::v2f32; + NumElts = 2; + } else if (Elts.size() <= 4) { + Type = MVT::v4f32; + NumElts = 4; + } else if (Elts.size() <= 8) { + Type = MVT::v8f32; + NumElts = 8; + } else { + assert(Elts.size() <= 16); + Type = MVT::v16f32; + NumElts = 16; + } + + SmallVector<SDValue, 16> VecElts(NumElts); + for (unsigned i = 0; i < Elts.size(); ++i) { + SDValue Elt = Elts[i]; + if (Elt.getValueType() != MVT::f32) + Elt = DAG.getBitcast(MVT::f32, Elt); + VecElts[i] = Elt; + } + for (unsigned i = Elts.size(); i < NumElts; ++i) + VecElts[i] = DAG.getUNDEF(MVT::f32); + + if (NumElts == 1) + return VecElts[0]; + return DAG.getBuildVector(Type, DL, VecElts); +} + +static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, + SDValue *GLC, SDValue *SLC) { + auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode()); + if (!CachePolicyConst) + return false; + + uint64_t Value = CachePolicyConst->getZExtValue(); + SDLoc DL(CachePolicy); + if (GLC) { + *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x1; + } + if (SLC) { + *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x2; + } + + return Value == 0; +} + +SDValue SITargetLowering::lowerImage(SDValue Op, + const AMDGPU::ImageDimIntrinsicInfo *Intr, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); + + SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end()); + bool IsD16 = false; + SDValue VData; + int NumVDataDwords; + unsigned AddrIdx; // Index of first address argument + unsigned DMask; + + if (BaseOpcode->Atomic) { + VData = Op.getOperand(2); + + bool Is64Bit = VData.getValueType() == MVT::i64; + if (BaseOpcode->AtomicX2) { + SDValue VData2 = Op.getOperand(3); + VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL, + {VData, VData2}); + if (Is64Bit) + VData = DAG.getBitcast(MVT::v4i32, VData); + + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; + NumVDataDwords = Is64Bit ? 4 : 2; + AddrIdx = 4; + } else { + DMask = Is64Bit ? 0x3 : 0x1; + NumVDataDwords = Is64Bit ? 
2 : 1; + AddrIdx = 3; + } + } else { + unsigned DMaskIdx; + + if (BaseOpcode->Store) { + VData = Op.getOperand(2); + + MVT StoreVT = VData.getSimpleValueType(); + if (StoreVT.getScalarType() == MVT::f16) { + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || + !BaseOpcode->HasD16) + return Op; // D16 is unsupported for this instruction + + IsD16 = true; + VData = handleD16VData(VData, DAG); + } + + NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; + DMaskIdx = 3; + } else { + MVT LoadVT = Op.getSimpleValueType(); + if (LoadVT.getScalarType() == MVT::f16) { + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || + !BaseOpcode->HasD16) + return Op; // D16 is unsupported for this instruction + + IsD16 = true; + if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem()) + ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; + } + + NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32; + DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1; + } + + auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); + if (!DMaskConst) + return Op; + + AddrIdx = DMaskIdx + 1; + DMask = DMaskConst->getZExtValue(); + if (!DMask && !BaseOpcode->Store) { + // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they + // store the channels' default values. + SDValue Undef = DAG.getUNDEF(Op.getValueType()); + if (isa<MemSDNode>(Op)) + return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL); + return Undef; + } + } + + unsigned NumVAddrs = BaseOpcode->NumExtraArgs + + (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) + + (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 1 : 0); + SmallVector<SDValue, 4> VAddrs; + for (unsigned i = 0; i < NumVAddrs; ++i) + VAddrs.push_back(Op.getOperand(AddrIdx + i)); + SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs); + + SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); + SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); + unsigned CtrlIdx; // Index of texfailctrl argument + SDValue Unorm; + if (!BaseOpcode->Sampler) { + Unorm = True; + CtrlIdx = AddrIdx + NumVAddrs + 1; + } else { + auto UnormConst = + dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2)); + if (!UnormConst) + return Op; + + Unorm = UnormConst->getZExtValue() ? True : False; + CtrlIdx = AddrIdx + NumVAddrs + 3; + } + + SDValue TexFail = Op.getOperand(CtrlIdx); + auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode()); + if (!TexFailConst || TexFailConst->getZExtValue() != 0) + return Op; + + SDValue GLC; + SDValue SLC; + if (BaseOpcode->Atomic) { + GLC = True; // TODO no-return optimization + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC)) + return Op; + } else { + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC)) + return Op; + } + + SmallVector<SDValue, 14> Ops; + if (BaseOpcode->Store || BaseOpcode->Atomic) + Ops.push_back(VData); // vdata + Ops.push_back(VAddr); + Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc + if (BaseOpcode->Sampler) + Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler + Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); + Ops.push_back(Unorm); + Ops.push_back(GLC); + Ops.push_back(SLC); + Ops.push_back(False); // r128 + Ops.push_back(False); // tfe + Ops.push_back(False); // lwe + Ops.push_back(DimInfo->DA ? True : False); + if (BaseOpcode->HasD16) + Ops.push_back(IsD16 ? 
True : False); + if (isa<MemSDNode>(Op)) + Ops.push_back(Op.getOperand(0)); // chain + + int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32; + int Opcode = -1; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6, + NumVDataDwords, NumVAddrDwords); + assert(Opcode != -1); + + MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); + if (auto MemOp = dyn_cast<MemSDNode>(Op)) { + MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1); + *MemRefs = MemOp->getMemOperand(); + NewNode->setMemRefs(MemRefs, MemRefs + 1); + } + + if (BaseOpcode->AtomicX2) { + SmallVector<SDValue, 1> Elt; + DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); + return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); + } else if (IsD16 && !BaseOpcode->Store) { + MVT LoadVT = Op.getSimpleValueType(); + SDValue Adjusted = adjustLoadValueTypeImpl( + SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem()); + return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL); + } + + return SDValue(NewNode, 0); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -3942,14 +4675,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_implicit_buffer_ptr: { - if (getSubtarget()->isAmdCodeObjectV2(MF)) + if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction())) return emitNonHSAIntrinsicError(DAG, DL, VT); return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); } case Intrinsic::amdgcn_dispatch_ptr: case Intrinsic::amdgcn_queue_ptr: { - if (!Subtarget->isAmdCodeObjectV2(MF)) { + if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) { DiagnosticInfoUnsupported BadIntrin( MF.getFunction(), "unsupported hsa intrinsic without hsa target", DL.getDebugLoc()); @@ -3979,16 +4712,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_rsq: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rsq_legacy: - if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rcp_legacy: - if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rsq_clamp: { - if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); Type *Type = VT.getTypeForEVT(*DAG.getContext()); @@ -4006,37 +4739,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, false); + SI::KernelInputOffsets::NGROUPS_X, 4, false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); 
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, false); + SI::KernelInputOffsets::NGROUPS_Y, 4, false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, false); + SI::KernelInputOffsets::NGROUPS_Z, 4, false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); @@ -4125,7 +4858,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_log_clamp: { - if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return SDValue(); DiagnosticInfoUnsupported BadIntrin( @@ -4210,6 +4943,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_fmed3: return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_fdot2: + return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_fmul_legacy: return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -4221,10 +4957,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_ubfe: return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::amdgcn_cvt_pkrtz: { - // FIXME: Stop adding cast if v2f16 legal. + case Intrinsic::amdgcn_cvt_pkrtz: + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + // FIXME: Stop adding cast if v2f16/v2i16 are legal. 
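// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): each cvt_pk* intrinsic handled
// below converts two 32-bit sources into two 16-bit results and packs them
// into one dword, which is why the lowering builds an i32 node and then
// bitcasts it to the v2i16/v2f16 result type. A scalar model of the
// unsigned-saturating variant (the other variants differ only in the per-lane
// conversion); helper name is made up, assumes <cstdint> and <algorithm>:
static uint32_t cvtPkU16Model(uint32_t X, uint32_t Y) {
  auto Sat16 = [](uint32_t V) { return std::min<uint32_t>(V, 0xffffu); };
  return Sat16(X) | (Sat16(Y) << 16); // X in the low half, Y in the high half
}
// ---------------------------------------------------------------------------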
EVT VT = Op.getValueType(); - SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32, + unsigned Opcode; + + if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz) + Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16) + Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16) + Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16) + Opcode = AMDGPUISD::CVT_PK_I16_I32; + else + Opcode = AMDGPUISD::CVT_PK_U16_U32; + + SDValue Node = DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); } @@ -4238,17 +4991,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src), 0); } - case Intrinsic::amdgcn_image_getlod: - case Intrinsic::amdgcn_image_getresinfo: { - unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4; - - // Replace dmask with everything disabled with undef. - const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx)); - if (!DMask || DMask->isNullValue()) - return DAG.getUNDEF(Op.getValueType()); - return SDValue(); - } + case Intrinsic::amdgcn_fmad_ftz: + return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); default: + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) + return lowerImage(Op, ImageDimIntr, DAG); + return Op; } } @@ -4257,14 +5007,34 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); SDLoc DL(Op); - MachineFunction &MF = DAG.getMachineFunction(); switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast<MemSDNode>(Op); - unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? - AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC; + unsigned Opc; + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + Opc = AMDGPUISD::ATOMIC_INC; + break; + case Intrinsic::amdgcn_atomic_dec: + Opc = AMDGPUISD::ATOMIC_DEC; + break; + case Intrinsic::amdgcn_ds_fadd: + Opc = AMDGPUISD::ATOMIC_LOAD_FADD; + break; + case Intrinsic::amdgcn_ds_fmin: + Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; + break; + case Intrinsic::amdgcn_ds_fmax: + Opc = AMDGPUISD::ATOMIC_LOAD_FMAX; + break; + default: + llvm_unreachable("Unknown intrinsic!"); + } SDValue Ops[] = { M->getOperand(0), // Chain M->getOperand(2), // Ptr @@ -4284,21 +5054,28 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // glc Op.getOperand(6) // slc }; - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); + auto *M = cast<MemSDNode>(Op); + EVT LoadVT = Op.getValueType(); + bool IsD16 = LoadVT.getScalarType() == MVT::f16; + if (IsD16) + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(MFI->getBufferPSV()), - MachineMemOperand::MOLoad, - VT.getStoreSize(), VT.getStoreSize()); - - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand()); } case Intrinsic::amdgcn_tbuffer_load: { + MemSDNode *M = cast<MemSDNode>(Op); + EVT LoadVT = Op.getValueType(); + bool IsD16 = LoadVT.getScalarType() == MVT::f16; + if (IsD16) { + return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG); + } + SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -4312,14 +5089,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(10) // slc }; - EVT VT = Op.getOperand(2).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad, - VT.getStoreSize(), VT.getStoreSize()); return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); + Op->getVTList(), Ops, LoadVT, + M->getMemOperand()); } case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: @@ -4339,14 +5111,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // offset Op.getOperand(6) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile, - VT.getStoreSize(), 4); + EVT VT = Op.getValueType(); + + auto *M = cast<MemSDNode>(Op); unsigned Opcode = 0; switch (IntrID) { @@ -4384,7 +5151,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, llvm_unreachable("unhandled atomic opcode"); } - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); } case Intrinsic::amdgcn_buffer_atomic_cmpswap: { @@ -4397,78 +5165,46 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // offset Op.getOperand(7) // slc }; - EVT VT = Op.getOperand(4).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile, - VT.getStoreSize(), 4); + EVT VT = Op.getValueType(); + auto *M = cast<MemSDNode>(Op); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, - Op->getVTList(), Ops, VT, MMO); + Op->getVTList(), Ops, VT, M->getMemOperand()); } - // Basic sample. - case Intrinsic::amdgcn_image_sample: - case Intrinsic::amdgcn_image_sample_cl: - case Intrinsic::amdgcn_image_sample_d: - case Intrinsic::amdgcn_image_sample_d_cl: - case Intrinsic::amdgcn_image_sample_l: - case Intrinsic::amdgcn_image_sample_b: - case Intrinsic::amdgcn_image_sample_b_cl: - case Intrinsic::amdgcn_image_sample_lz: - case Intrinsic::amdgcn_image_sample_cd: - case Intrinsic::amdgcn_image_sample_cd_cl: - - // Sample with comparison. 
- case Intrinsic::amdgcn_image_sample_c: - case Intrinsic::amdgcn_image_sample_c_cl: - case Intrinsic::amdgcn_image_sample_c_d: - case Intrinsic::amdgcn_image_sample_c_d_cl: - case Intrinsic::amdgcn_image_sample_c_l: - case Intrinsic::amdgcn_image_sample_c_b: - case Intrinsic::amdgcn_image_sample_c_b_cl: - case Intrinsic::amdgcn_image_sample_c_lz: - case Intrinsic::amdgcn_image_sample_c_cd: - case Intrinsic::amdgcn_image_sample_c_cd_cl: - - // Sample with offsets. - case Intrinsic::amdgcn_image_sample_o: - case Intrinsic::amdgcn_image_sample_cl_o: - case Intrinsic::amdgcn_image_sample_d_o: - case Intrinsic::amdgcn_image_sample_d_cl_o: - case Intrinsic::amdgcn_image_sample_l_o: - case Intrinsic::amdgcn_image_sample_b_o: - case Intrinsic::amdgcn_image_sample_b_cl_o: - case Intrinsic::amdgcn_image_sample_lz_o: - case Intrinsic::amdgcn_image_sample_cd_o: - case Intrinsic::amdgcn_image_sample_cd_cl_o: - - // Sample with comparison and offsets. - case Intrinsic::amdgcn_image_sample_c_o: - case Intrinsic::amdgcn_image_sample_c_cl_o: - case Intrinsic::amdgcn_image_sample_c_d_o: - case Intrinsic::amdgcn_image_sample_c_d_cl_o: - case Intrinsic::amdgcn_image_sample_c_l_o: - case Intrinsic::amdgcn_image_sample_c_b_o: - case Intrinsic::amdgcn_image_sample_c_b_cl_o: - case Intrinsic::amdgcn_image_sample_c_lz_o: - case Intrinsic::amdgcn_image_sample_c_cd_o: - case Intrinsic::amdgcn_image_sample_c_cd_cl_o: { - // Replace dmask with everything disabled with undef. - const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5)); - if (!DMask || DMask->isNullValue()) { - SDValue Undef = DAG.getUNDEF(Op.getValueType()); - return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op)); - } + default: + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrID)) + return lowerImage(Op, ImageDimIntr, DAG); return SDValue(); } - default: - return SDValue(); +} + +SDValue SITargetLowering::handleD16VData(SDValue VData, + SelectionDAG &DAG) const { + EVT StoreVT = VData.getValueType(); + + // No change for f16 and legal vector D16 types. + if (!StoreVT.isVector()) + return VData; + + SDLoc DL(VData); + assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16"); + + if (Subtarget->hasUnpackedD16VMem()) { + // We need to unpack the packed data to store. 
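// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): on subtargets with "unpacked"
// D16 memory instructions every 16-bit element travels in the low half of its
// own 32-bit lane, so a packed <2 x half> store payload is widened roughly as
// in this scalar model (helper name made up, assumes <cstdint>):
static void unpackD16V2Model(uint32_t PackedV2F16, uint32_t OutDwords[2]) {
  OutDwords[0] = PackedV2F16 & 0xffffu; // element 0, zero-extended to 32 bits
  OutDwords[1] = PackedV2F16 >> 16;     // element 1, zero-extended to 32 bits
}
// ---------------------------------------------------------------------------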
+ EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + + EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + StoreVT.getVectorNumElements()); + SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); + return DAG.UnrollVectorOp(ZExt.getNode()); } + + assert(isTypeLegal(StoreVT)); + return VData; } SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, @@ -4558,7 +5294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, @@ -4613,9 +5349,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_tbuffer_store: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); SDValue Ops[] = { Chain, - Op.getOperand(2), // vdata + VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex Op.getOperand(5), // voffset @@ -4626,42 +5366,133 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(10), // glc Op.getOperand(11) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); + unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : + AMDGPUISD::TBUFFER_STORE_FORMAT; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); } case Intrinsic::amdgcn_buffer_store: case Intrinsic::amdgcn_buffer_store_format: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); SDValue Ops[] = { Chain, - Op.getOperand(2), // vdata + VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex Op.getOperand(5), // offset Op.getOperand(6), // glc Op.getOperand(7) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable, - VT.getStoreSize(), 4); + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? + AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + default: { + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) + return lowerImage(Op, ImageDimIntr, DAG); - unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ? 
- AMDGPUISD::BUFFER_STORE : - AMDGPUISD::BUFFER_STORE_FORMAT; - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); + return Op; } + } +} - default: +static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, + ISD::LoadExtType ExtType, SDValue Op, + const SDLoc &SL, EVT VT) { + if (VT.bitsLT(Op.getValueType())) + return DAG.getNode(ISD::TRUNCATE, SL, VT, Op); + + switch (ExtType) { + case ISD::SEXTLOAD: + return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op); + case ISD::ZEXTLOAD: + return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op); + case ISD::EXTLOAD: + return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op); + case ISD::NON_EXTLOAD: return Op; } + + llvm_unreachable("invalid ext type"); +} + +SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + if (Ld->getAlignment() < 4 || Ld->isDivergent()) + return SDValue(); + + // FIXME: Constant loads should all be marked invariant. + unsigned AS = Ld->getAddressSpace(); + if (AS != AMDGPUASI.CONSTANT_ADDRESS && + AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT && + (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant())) + return SDValue(); + + // Don't do this early, since it may interfere with adjacent load merging for + // illegal types. We can avoid losing alignment information for exotic types + // pre-legalize. + EVT MemVT = Ld->getMemoryVT(); + if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) || + MemVT.getSizeInBits() >= 32) + return SDValue(); + + SDLoc SL(Ld); + + assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) && + "unexpected vector extload"); + + // TODO: Drop only high part of range. + SDValue Ptr = Ld->getBasePtr(); + SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, + MVT::i32, SL, Ld->getChain(), Ptr, + Ld->getOffset(), + Ld->getPointerInfo(), MVT::i32, + Ld->getAlignment(), + Ld->getMemOperand()->getFlags(), + Ld->getAAInfo(), + nullptr); // Drop ranges + + EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); + if (MemVT.isFloatingPoint()) { + assert(Ld->getExtensionType() == ISD::NON_EXTLOAD && + "unexpected fp extload"); + TruncVT = MemVT.changeTypeToInteger(); + } + + SDValue Cvt = NewLoad; + if (Ld->getExtensionType() == ISD::SEXTLOAD) { + Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad, + DAG.getValueType(TruncVT)); + } else if (Ld->getExtensionType() == ISD::ZEXTLOAD || + Ld->getExtensionType() == ISD::NON_EXTLOAD) { + Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT); + } else { + assert(Ld->getExtensionType() == ISD::EXTLOAD); + } + + EVT VT = Ld->getValueType(0); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + + DCI.AddToWorklist(Cvt.getNode()); + + // We may need to handle exotic cases, such as i16->i64 extloads, so insert + // the appropriate extension from the 32-bit load. + Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT); + DCI.AddToWorklist(Cvt.getNode()); + + // Handle conversion back to floating point if necessary. 
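// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the widening above trades a
// sub-dword uniform load for one full 32-bit load plus an in-register
// narrowing, so the access can still be selected as a scalar (SMEM) load,
// which operates on whole dwords. This is only safe because the original load
// is uniform and at least dword-aligned. A little-endian scalar model of the
// zero-extending i8 case (helper name made up, assumes <cstdint>):
static uint32_t widenedZExtLoadI8Model(const uint32_t *DwordAlignedPtr) {
  uint32_t Dword = *DwordAlignedPtr; // one dword load covering the byte
  return Dword & 0xffu;              // zero-extend-in-reg of the low byte
}
// ---------------------------------------------------------------------------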
+ Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt); + + return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL); } SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { @@ -4700,9 +5531,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); + unsigned Alignment = Load->getAlignment(); unsigned AS = Load->getAddressSpace(); if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - AS, Load->getAlignment())) { + AS, Alignment)) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, DL); @@ -4717,24 +5549,32 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; unsigned NumElements = MemVT.getVectorNumElements(); - if (AS == AMDGPUASI.CONSTANT_ADDRESS) { - if (isMemOpUniform(Load)) + + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { + if (!Op->isDivergent() && Alignment >= 4) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { - if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && - !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) + + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || + AS == AMDGPUASI.GLOBAL_ADDRESS) { + if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && + !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && + Alignment >= 4) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS || + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || + AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); @@ -4761,21 +5601,20 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { - if (NumElements > 2) - return SplitVectorLoad(Op, DAG); - - if (NumElements == 2) + // Use ds_read_b128 if possible. + if (Subtarget->useDS128() && Load->getAlignment() >= 16 && + MemVT.getStoreSize() == 16) return SDValue(); - // If properly aligned, if we split we might be able to use ds_read_b64. 
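// ---------------------------------------------------------------------------
// Illustrative example (not part of the patch): with the ds_read_b128 check
// added above, a 16-byte LDS load such as
//   %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 16
// is left whole on subtargets where useDS128() is true, while the same load
// with only 8-byte alignment is still split so the halves can be selected to
// ds_read_b64.
// ---------------------------------------------------------------------------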
- return SplitVectorLoad(Op, DAG); + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); } return SDValue(); } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() != MVT::i64) - return SDValue(); + EVT VT = Op.getValueType(); + assert(VT.getSizeInBits() == 64); SDLoc DL(Op); SDValue Cond = Op.getOperand(0); @@ -4797,7 +5636,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); - return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); + return DAG.getNode(ISD::BITCAST, DL, VT, Res); } // Catch division cases where we can use shortcuts with rcp and rsq @@ -4809,8 +5648,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, SDValue RHS = Op.getOperand(1); EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || - Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal(); + bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal(); if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) return SDValue(); @@ -5067,7 +5905,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue Scale; - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. @@ -5165,14 +6003,14 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + // Use ds_write_b128 if possible. + if (Subtarget->useDS128() && Store->getAlignment() >= 16 && + VT.getStoreSize() == 16) + return SDValue(); + if (NumElements > 2) return SplitVectorStore(Op, DAG); - - if (NumElements == 2) - return Op; - - // If properly aligned, if we split we might be able to use ds_write_b64. - return SplitVectorStore(Op, DAG); + return SDValue(); } else { llvm_unreachable("unhandled address space"); } @@ -5246,7 +6084,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, // easier if i8 vectors weren't promoted to i32 vectors, particularly after // types are legalized. v4i8 -> v4f32 is probably the only case to worry // about in practice. - if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { + if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); DCI.AddToWorklist(Cvt.getNode()); @@ -5389,6 +6227,71 @@ static bool isBoolSGPR(SDValue V) { return false; } +// If a constant has all zeroes or all ones within each byte return it. +// Otherwise return 0. +static uint32_t getConstantPermuteMask(uint32_t C) { + // 0xff for any zero byte in the mask + uint32_t ZeroByteMask = 0; + if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff; + if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00; + if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000; + if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000; + uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte + if ((NonZeroByteMask & C) != NonZeroByteMask) + return 0; // Partial bytes selected. + return C; +} + +// Check if a node selects whole bytes from its operand 0 starting at a byte +// boundary while masking the rest. 
Returns select mask as in the v_perm_b32 +// or -1 if not succeeded. +// Note byte select encoding: +// value 0-3 selects corresponding source byte; +// value 0xc selects zero; +// value 0xff selects 0xff. +static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) { + assert(V.getValueSizeInBits() == 32); + + if (V.getNumOperands() != 2) + return ~0; + + ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1)); + if (!N1) + return ~0; + + uint32_t C = N1->getZExtValue(); + + switch (V.getOpcode()) { + default: + break; + case ISD::AND: + if (uint32_t ConstMask = getConstantPermuteMask(C)) { + return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask); + } + break; + + case ISD::OR: + if (uint32_t ConstMask = getConstantPermuteMask(C)) { + return (0x03020100 & ~ConstMask) | ConstMask; + } + break; + + case ISD::SHL: + if (C % 8) + return ~0; + + return uint32_t((0x030201000c0c0c0cull << C) >> 32); + + case ISD::SRL: + if (C % 8) + return ~0; + + return uint32_t(0x0c0c0c0c03020100ull >> C); + } + + return ~0; +} + SDValue SITargetLowering::performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.isBeforeLegalize()) @@ -5435,6 +6338,20 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, } } } + + // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) + if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM && + isa<ConstantSDNode>(LHS.getOperand(2))) { + uint32_t Sel = getConstantPermuteMask(Mask); + if (!Sel) + return SDValue(); + + // Select 0xc for all zero bytes + Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c); + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), + LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); + } } // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> @@ -5487,6 +6404,54 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, LHS, DAG.getConstant(0, SDLoc(N), MVT::i32)); } + // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && + N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) { + uint32_t LHSMask = getPermuteMask(DAG, LHS); + uint32_t RHSMask = getPermuteMask(DAG, RHS); + if (LHSMask != ~0u && RHSMask != ~0u) { + // Canonicalize the expression in an attempt to have fewer unique masks + // and therefore fewer registers used to hold the masks. + if (LHSMask > RHSMask) { + std::swap(LHSMask, RHSMask); + std::swap(LHS, RHS); + } + + // Select 0xc for each lane used from source operand. Zero has 0xc mask + // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. + uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + + // Check of we need to combine values from two sources within a byte. + if (!(LHSUsedLanes & RHSUsedLanes) && + // If we select high and lower word keep it for SDWA. + // TODO: teach SDWA to work with v_perm_b32 and remove the check. + !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { + // Each byte in each mask is either selector mask 0-3, or has higher + // bits set in either of masks, which can be 0xff for 0xff or 0x0c for + // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise + // mask which is not 0xff wins. By anding both masks we have a correct + // result except that 0x0c shall be corrected to give 0x0c only. 
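// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a scalar model of the byte
// selection this combine targets, covering only the selector values documented
// above (0-3 pick a byte of the second source, 4-7 a byte of the first, 0x0c
// yields 0x00, anything else is treated here as the 0xff selector; the real
// instruction has additional selector values that this combine never emits).
// Helper name made up, assumes <cstdint>:
static uint32_t permB32Model(uint32_t Src0, uint32_t Src1, uint32_t Sel) {
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t S = (Sel >> (8 * I)) & 0xff;
    uint32_t Byte;
    if (S <= 3)
      Byte = (Src1 >> (8 * S)) & 0xff;       // byte S of the second source
    else if (S <= 7)
      Byte = (Src0 >> (8 * (S - 4))) & 0xff; // byte S-4 of the first source
    else if (S == 0x0c)
      Byte = 0x00;                           // constant zero
    else
      Byte = 0xff;                           // 0xff selector
    Result |= Byte << (8 * I);
  }
  return Result;
}
// ---------------------------------------------------------------------------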
+ uint32_t Mask = LHSMask & RHSMask; + for (unsigned I = 0; I < 32; I += 8) { + uint32_t ByteSel = 0xff << I; + if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c) + Mask &= (0x0c << I) & 0xffffffff; + } + + // Add 4 to each active LHS lane. It will not affect any existing 0xff + // or 0x0c. + uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404); + SDLoc DL(N); + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, + LHS.getOperand(0), RHS.getOperand(0), + DAG.getConstant(Sel, DL, MVT::i32)); + } + } + } + return SDValue(); } @@ -5522,6 +6487,60 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, return SDValue(); } + // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) + if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() && + LHS.getOpcode() == AMDGPUISD::PERM && + isa<ConstantSDNode>(LHS.getOperand(2))) { + uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1)); + if (!Sel) + return SDValue(); + + Sel |= LHS.getConstantOperandVal(2); + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), + LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); + } + + // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && + N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) { + uint32_t LHSMask = getPermuteMask(DAG, LHS); + uint32_t RHSMask = getPermuteMask(DAG, RHS); + if (LHSMask != ~0u && RHSMask != ~0u) { + // Canonicalize the expression in an attempt to have fewer unique masks + // and therefore fewer registers used to hold the masks. + if (LHSMask > RHSMask) { + std::swap(LHSMask, RHSMask); + std::swap(LHS, RHS); + } + + // Select 0xc for each lane used from source operand. Zero has 0xc mask + // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. + uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + + // Check of we need to combine values from two sources within a byte. + if (!(LHSUsedLanes & RHSUsedLanes) && + // If we select high and lower word keep it for SDWA. + // TODO: teach SDWA to work with v_perm_b32 and remove the check. + !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { + // Kill zero bytes selected by other mask. Zero value is 0xc. 
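// ---------------------------------------------------------------------------
// Illustrative worked example (not part of the patch, values derived by hand):
// for (x & 0x000000ff) | (y & 0x0000ff00) the per-operand masks are
//   getPermuteMask(x & 0xff)   == 0x0c0c0c00
//   getPermuteMask(y & 0xff00) == 0x0c0c010c
// their used-lane sets are disjoint, and the steps below merge them into one
// v_perm_b32 of y and x with selector 0x0c0c0500: byte0 comes from x
// (selector 0x00), byte1 from y (selector 0x05 = 4 + 1), and the two high
// bytes are zero (selector 0x0c).
// ---------------------------------------------------------------------------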
+ LHSMask &= ~RHSUsedLanes; + RHSMask &= ~LHSUsedLanes; + // Add 4 to each active LHS lane + LHSMask |= LHSUsedLanes & 0x04040404; + // Combine masks + uint32_t Sel = LHSMask | RHSMask; + SDLoc DL(N); + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, + LHS.getOperand(0), RHS.getOperand(0), + DAG.getConstant(Sel, DL, MVT::i32)); + } + } + } + if (VT != MVT::i64) return SDValue(); @@ -5628,6 +6647,7 @@ static bool fp16SrcZerosHighBits(unsigned Opc) { case AMDGPUISD::FMAD_FTZ: case AMDGPUISD::RCP: case AMDGPUISD::RSQ: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::LDEXP: return true; default: @@ -5680,6 +6700,23 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performRcpCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + + if (N0.isUndef()) + return N0; + + if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || + N0.getOpcode() == ISD::SINT_TO_FP)) { + return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0, + N->getFlags()); + } + + return AMDGPUTargetLowering::performRcpCombine(N, DCI); +} + static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) return true; @@ -5688,7 +6725,7 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { } static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, - const SISubtarget *ST, unsigned MaxDepth=5) { + const GCNSubtarget *ST, unsigned MaxDepth=5) { // If source is a result of another standard FP operation it is already in // canonical form. @@ -5946,7 +6983,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && - VT != MVT::f64 && + !VT.isVector() && VT != MVT::f64 && ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) { // max(max(a, b), c) -> max3(a, b, c) // min(min(a, b), c) -> min3(a, b, c) @@ -6066,15 +7103,87 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, SDValue SITargetLowering::performExtractVectorEltCombine( SDNode *N, DAGCombinerInfo &DCI) const { SDValue Vec = N->getOperand(0); - SelectionDAG &DAG = DCI.DAG; - if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { + + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + + if ((Vec.getOpcode() == ISD::FNEG || + Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) { SDLoc SL(N); EVT EltVT = N->getValueType(0); SDValue Idx = N->getOperand(1); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec.getOperand(0), Idx); - return DAG.getNode(ISD::FNEG, SL, EltVT, Elt); + return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt); + } + + // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) + // => + // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) + // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) + // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt + if (Vec.hasOneUse() && DCI.isBeforeLegalize()) { + SDLoc SL(N); + EVT EltVT = N->getValueType(0); + SDValue Idx = N->getOperand(1); + unsigned Opc = Vec.getOpcode(); + + switch(Opc) { + default: + return SDValue(); + // TODO: Support other binary operations. 
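// ---------------------------------------------------------------------------
// Illustrative example (not part of the patch): when a vector binary operator
// has a single-lane extract as its only use, the rewrite below turns, e.g.,
//   %s = add <2 x i32> %a, %b
//   %e = extractelement <2 x i32> %s, i32 1
// into the scalar form
//   %a1 = extractelement <2 x i32> %a, i32 1
//   %b1 = extractelement <2 x i32> %b, i32 1
//   %e  = add i32 %a1, %b1
// so the vector operation itself disappears.
// ---------------------------------------------------------------------------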
+ case ISD::FADD: + case ISD::ADD: + case ISD::UMIN: + case ISD::UMAX: + case ISD::SMIN: + case ISD::SMAX: + case ISD::FMAXNUM: + case ISD::FMINNUM: + return DAG.getNode(Opc, SL, EltVT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(0), Idx), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(1), Idx)); + } + } + + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned VecSize = VecVT.getSizeInBits(); + unsigned EltSize = EltVT.getSizeInBits(); + + // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit + // elements. This exposes more load reduction opportunities by replacing + // multiple small extract_vector_elements with a single 32-bit extract. + auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (EltSize <= 16 && + EltVT.isByteSized() && + VecSize > 32 && + VecSize % 32 == 0 && + Idx) { + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); + + unsigned BitIndex = Idx->getZExtValue() * EltSize; + unsigned EltIdx = BitIndex / 32; + unsigned LeftoverBitIdx = BitIndex % 32; + SDLoc SL(N); + + SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); + DCI.AddToWorklist(Cast.getNode()); + + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, + DAG.getConstant(EltIdx, SL, MVT::i32)); + DCI.AddToWorklist(Elt.getNode()); + SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, + DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); + DCI.AddToWorklist(Srl.getNode()); + + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl); + DCI.AddToWorklist(Trunc.getNode()); + return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc); } return SDValue(); @@ -6135,8 +7244,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const TargetOptions &Options = DAG.getTarget().Options; if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || - (N0->getFlags().hasUnsafeAlgebra() && - N1->getFlags().hasUnsafeAlgebra())) && + (N0->getFlags().hasAllowContract() && + N1->getFlags().hasAllowContract())) && isFMAFasterThanFMulAndFAdd(VT)) { return ISD::FMA; } @@ -6192,7 +7301,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } - if (VT != MVT::i32) + if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); // add x, zext (setcc) => addcarry x, 0, setcc @@ -6368,6 +7477,79 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performFMACombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc SL(N); + + if (!Subtarget->hasDLInsts() || VT != MVT::f32) + return SDValue(); + + // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> + // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)) + SDValue Op1 = N->getOperand(0); + SDValue Op2 = N->getOperand(1); + SDValue FMA = N->getOperand(2); + + if (FMA.getOpcode() != ISD::FMA || + Op1.getOpcode() != ISD::FP_EXTEND || + Op2.getOpcode() != ISD::FP_EXTEND) + return SDValue(); + + // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, + // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract + // is sufficient to allow generaing fdot2. 
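// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the pattern matched below is a
// two-element half-precision dot product accumulated into f32, which is what
// the fdot2 node models. A scalar reference with the packed v2f16 operands
// already converted to float pairs (the f16->f32 step is left out to keep the
// model in plain C++; helper name made up):
static float fdot2Model(float A0, float A1, float B0, float B1, float Acc) {
  return A0 * B0 + A1 * B1 + Acc; // fma(A0, B0, fma(A1, B1, Acc))
}
// ---------------------------------------------------------------------------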
+ const TargetOptions &Options = DAG.getTarget().Options; + if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + (N->getFlags().hasAllowContract() && + FMA->getFlags().hasAllowContract())) { + Op1 = Op1.getOperand(0); + Op2 = Op2.getOperand(0); + if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SDValue Vec1 = Op1.getOperand(0); + SDValue Idx1 = Op1.getOperand(1); + SDValue Vec2 = Op2.getOperand(0); + + SDValue FMAOp1 = FMA.getOperand(0); + SDValue FMAOp2 = FMA.getOperand(1); + SDValue FMAAcc = FMA.getOperand(2); + + if (FMAOp1.getOpcode() != ISD::FP_EXTEND || + FMAOp2.getOpcode() != ISD::FP_EXTEND) + return SDValue(); + + FMAOp1 = FMAOp1.getOperand(0); + FMAOp2 = FMAOp2.getOperand(0); + if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SDValue Vec3 = FMAOp1.getOperand(0); + SDValue Vec4 = FMAOp2.getOperand(0); + SDValue Idx2 = FMAOp1.getOperand(1); + + if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) || + // Idx1 and Idx2 cannot be the same. + Idx1 == Idx2) + return SDValue(); + + if (Vec1 == Vec2 || Vec3 == Vec4) + return SDValue(); + + if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16) + return SDValue(); + + if ((Vec1 == Vec3 && Vec2 == Vec4) || + (Vec1 == Vec4 && Vec2 == Vec3)) + return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc); + } + return SDValue(); +} + SDValue SITargetLowering::performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -6387,23 +7569,49 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, } } - if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && - isBoolSGPR(LHS.getOperand(0))) { - // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 - // setcc (sext from i1 cc), -1, eq|sle|uge) => cc - // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 - // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc - if ((CRHS->isAllOnesValue() && - (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || - (CRHS->isNullValue() && - (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) - return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), - DAG.getConstant(-1, SL, MVT::i1)); - if ((CRHS->isAllOnesValue() && - (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || - (CRHS->isNullValue() && - (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) - return LHS.getOperand(0); + if (CRHS) { + if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && + isBoolSGPR(LHS.getOperand(0))) { + // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 + // setcc (sext from i1 cc), -1, eq|sle|uge) => cc + // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 + // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || + (CRHS->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) + return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(-1, SL, MVT::i1)); + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || + (CRHS->isNullValue() && + (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) + return LHS.getOperand(0); + } + + uint64_t CRHSVal = CRHS->getZExtValue(); + if ((CC == ISD::SETEQ || CC == ISD::SETNE) && + LHS.getOpcode() == 
ISD::SELECT && + isa<ConstantSDNode>(LHS.getOperand(1)) && + isa<ConstantSDNode>(LHS.getOperand(2)) && + LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) && + isBoolSGPR(LHS.getOperand(0))) { + // Given CT != FT: + // setcc (select cc, CT, CF), CF, eq => xor cc, -1 + // setcc (select cc, CT, CF), CF, ne => cc + // setcc (select cc, CT, CF), CT, ne => xor cc, -1 + // setcc (select cc, CT, CF), CT, eq => cc + uint64_t CT = LHS.getConstantOperandVal(1); + uint64_t CF = LHS.getConstantOperandVal(2); + + if ((CF == CRHSVal && CC == ISD::SETEQ) || + (CT == CRHSVal && CC == ISD::SETNE)) + return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(-1, SL, MVT::i1)); + if ((CF == CRHSVal && CC == ISD::SETNE) || + (CT == CRHSVal && CC == ISD::SETEQ)) + return LHS.getOperand(0); + } } if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && @@ -6472,6 +7680,29 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performClampCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CSrc) + return SDValue(); + + const APFloat &F = CSrc->getValueAPF(); + APFloat Zero = APFloat::getZero(F.getSemantics()); + APFloat::cmpResult Cmp0 = F.compare(Zero); + if (Cmp0 == APFloat::cmpLessThan || + (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { + return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); + } + + APFloat One(F.getSemantics(), "1.0"); + APFloat::cmpResult Cmp1 = F.compare(One); + if (Cmp1 == APFloat::cmpGreaterThan) + return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); + + return SDValue(CSrc, 0); +} + + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -6503,7 +7734,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performMinMaxCombine(N, DCI); break; } - case ISD::LOAD: + case ISD::FMA: + return performFMACombine(N, DCI); + case ISD::LOAD: { + if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI)) + return Widended; + LLVM_FALLTHROUGH; + } case ISD::STORE: case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: @@ -6521,7 +7758,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case AMDGPUISD::ATOMIC_INC: - case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics. + case AMDGPUISD::ATOMIC_DEC: + case AMDGPUISD::ATOMIC_LOAD_FADD: + case AMDGPUISD::ATOMIC_LOAD_FMIN: + case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. 
if (DCI.isBeforeLegalize()) break; return performMemSDNodeCombine(cast<MemSDNode>(N), DCI); @@ -6537,11 +7777,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performClassCombine(N, DCI); case ISD::FCANONICALIZE: return performFCanonicalizeCombine(N, DCI); - case AMDGPUISD::FRACT: case AMDGPUISD::RCP: + return performRcpCombine(N, DCI); + case AMDGPUISD::FRACT: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::RSQ_CLAMP: case AMDGPUISD::LDEXP: { SDValue Src = N->getOperand(0); @@ -6561,6 +7803,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMed3Combine(N, DCI); case AMDGPUISD::CVT_PKRTZ_F16_F32: return performCvtPkRTZCombine(N, DCI); + case AMDGPUISD::CLAMP: + return performClampCombine(N, DCI); case ISD::SCALAR_TO_VECTOR: { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -6587,7 +7831,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } -/// \brief Helper function for adjustWritemask +/// Helper function for adjustWritemask static unsigned SubIdx2Lane(unsigned Idx) { switch (Idx) { default: return 0; @@ -6598,12 +7842,19 @@ static unsigned SubIdx2Lane(unsigned Idx) { } } -/// \brief Adjust the writemask of MIMG instructions +/// Adjust the writemask of MIMG instructions SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, SelectionDAG &DAG) const { + unsigned Opcode = Node->getMachineOpcode(); + + // Subtract 1 because the vdata output is not a MachineSDNode operand. + int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1; + if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx)) + return Node; // not implemented for D16 + SDNode *Users[4] = { nullptr }; unsigned Lane = 0; - unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; + unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; bool HasChain = Node->getNumValues() > 1; @@ -6653,9 +7904,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, unsigned BitsSet = countPopulation(NewDmask); - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII, - Node->getMachineOpcode(), BitsSet); + int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet); assert(NewOpcode != -1 && NewOpcode != static_cast<int>(Node->getMachineOpcode()) && "failed to find equivalent MIMG op"); @@ -6720,7 +7969,7 @@ static bool isFrameIndexOp(SDValue Op) { return isa<FrameIndexSDNode>(Op); } -/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) +/// Legalize target independent instructions (e.g. INSERT_SUBREG) /// with frame index operands. /// LLVM assumes that inputs are to these instructions are registers. SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, @@ -6767,7 +8016,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, return DAG.UpdateNodeOperands(Node, Ops); } -/// \brief Fold the instructions after selecting them. +/// Fold the instructions after selecting them. /// Returns null if users were already updated. 
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { @@ -6841,7 +8090,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, return Node; } -/// \brief Assign the register class depending on the number of +/// Assign the register class depending on the number of /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { @@ -6928,7 +8177,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); } -/// \brief Return a resource descriptor with the 'Add TID' bit enabled +/// Return a resource descriptor with the 'Add TID' bit enabled /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] /// of the resource descriptor) to create an offset, which is added to /// the resource pointer. @@ -6970,11 +8219,11 @@ std::pair<unsigned, const TargetRegisterClass *> SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (!isTypeLegal(VT)) - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); - + const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { switch (Constraint[0]) { + default: + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); case 's': case 'r': switch (VT.getSizeInBits()) { @@ -6982,40 +8231,56 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, nullptr); case 32: case 16: - return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass); + RC = &AMDGPU::SReg_32_XM0RegClass; + break; case 64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + RC = &AMDGPU::SGPR_64RegClass; + break; case 128: - return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + RC = &AMDGPU::SReg_128RegClass; + break; case 256: - return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + RC = &AMDGPU::SReg_256RegClass; + break; case 512: - return std::make_pair(0U, &AMDGPU::SReg_512RegClass); + RC = &AMDGPU::SReg_512RegClass; + break; } - + break; case 'v': switch (VT.getSizeInBits()) { default: return std::make_pair(0U, nullptr); case 32: case 16: - return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + RC = &AMDGPU::VGPR_32RegClass; + break; case 64: - return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + RC = &AMDGPU::VReg_64RegClass; + break; case 96: - return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + RC = &AMDGPU::VReg_96RegClass; + break; case 128: - return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + RC = &AMDGPU::VReg_128RegClass; + break; case 256: - return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + RC = &AMDGPU::VReg_256RegClass; + break; case 512: - return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + RC = &AMDGPU::VReg_512RegClass; + break; } + break; } + // We actually support i128, i16 and f16 as inline parameters + // even if they are not reported as legal + if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || + VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) + return std::make_pair(0U, RC); } if (Constraint.size() > 1) { - const TargetRegisterClass *RC = nullptr; if (Constraint[1] == 'v') { RC = &AMDGPU::VGPR_32RegClass; } else if (Constraint[1] == 's') { @@ -7052,8 +8317,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); const MachineFrameInfo &MFI = 
MF.getFrameInfo(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Info->isEntryFunction()) { // Callable functions have fixed registers used for stack access. @@ -7083,6 +8347,8 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, Info->getScratchWaveOffsetReg()); + Info->limitOccupancy(MF); + TargetLoweringBase::finalizeLowering(MF); } @@ -7103,3 +8369,69 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, // calculation won't overflow, so assume the sign bit is never set. Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); } + +bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, + FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const +{ + switch (N->getOpcode()) { + case ISD::Register: + case ISD::CopyFromReg: + { + const RegisterSDNode *R = nullptr; + if (N->getOpcode() == ISD::Register) { + R = dyn_cast<RegisterSDNode>(N); + } + else { + R = dyn_cast<RegisterSDNode>(N->getOperand(1)); + } + if (R) + { + const MachineFunction * MF = FLI->MF; + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); + unsigned Reg = R->getReg(); + if (TRI.isPhysicalRegister(Reg)) + return TRI.isVGPR(MRI, Reg); + + if (MRI.isLiveIn(Reg)) { + // workitem.id.x workitem.id.y workitem.id.z + // Any VGPR formal argument is also considered divergent + if (TRI.isVGPR(MRI, Reg)) + return true; + // Formal arguments of non-entry functions + // are conservatively considered divergent + else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) + return true; + } + return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg)); + } + } + break; + case ISD::LOAD: { + const LoadSDNode *L = dyn_cast<LoadSDNode>(N); + if (L->getMemOperand()->getAddrSpace() == + Subtarget->getAMDGPUAS().PRIVATE_ADDRESS) + return true; + } break; + case ISD::CALLSEQ_END: + return true; + break; + case ISD::INTRINSIC_WO_CHAIN: + { + + } + return AMDGPU::isIntrinsicSourceOfDivergence( + cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()); + case ISD::INTRINSIC_W_CHAIN: + return AMDGPU::isIntrinsicSourceOfDivergence( + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()); + // In some cases intrinsics that are a source of divergence have been + // lowered to AMDGPUISD so we also need to check those too. 
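// ---------------------------------------------------------------------------
// Illustrative example (not part of the patch): "divergent" here means the
// value may differ between lanes of a wave. For instance
//   %tid = call i32 @llvm.amdgcn.workitem.id.x()  ; divergent, per-lane value
//   %gid = call i32 @llvm.amdgcn.workgroup.id.x() ; uniform, same for the wave
// and, as handled above, any load from the private (scratch) address space is
// treated as divergent because every lane addresses its own stack.
// ---------------------------------------------------------------------------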
+ case AMDGPUISD::INTERP_MOV: + case AMDGPUISD::INTERP_P1: + case AMDGPUISD::INTERP_P2: + return true; + } + return false; +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index b48e67f7563a..ad049f2a71c3 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief SI DAG Lowering interface definition +/// SI DAG Lowering interface definition // //===----------------------------------------------------------------------===// @@ -22,12 +22,15 @@ namespace llvm { class SITargetLowering final : public AMDGPUTargetLowering { +private: + const GCNSubtarget *Subtarget; + SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const; SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, - uint64_t Offset, bool Signed, + uint64_t Offset, unsigned Align, bool Signed, const ISD::InputArg *Arg = nullptr) const; SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, @@ -42,10 +45,14 @@ class SITargetLowering final : public AMDGPUTargetLowering { SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT, unsigned Offset) const; + SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, + SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + + SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; @@ -60,7 +67,13 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - /// \brief Converts \p Op, which must be of floating point type, to the + SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, + SelectionDAG &DAG, + bool IsIntrinsic = false) const; + + SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const; + + /// Converts \p Op, which must be of floating point type, to the /// floating point type \p VT, by either extending or truncating it. SDValue getFPExtOrFPTrunc(SelectionDAG &DAG, SDValue Op, @@ -71,7 +84,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val, bool Signed, const ISD::InputArg *Arg = nullptr) const; - /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16. + /// Custom lowering for ISD::FP_ROUND for MVT::f16. 
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, @@ -80,7 +93,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const; SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; @@ -121,8 +136,11 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; bool isLegalGlobalAddressingMode(const AddrMode &AM) const; @@ -145,9 +163,11 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool shouldEmitPCReloc(const GlobalValue *GV) const; public: - SITargetLowering(const TargetMachine &tm, const SISubtarget &STI); + SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI); - const SISubtarget *getSubtarget() const; + const GCNSubtarget *getSubtarget() const; + + bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override; bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override; @@ -255,7 +275,10 @@ public: EVT VT) const override; MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; + SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; @@ -284,6 +307,9 @@ public: const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; + + bool isSDNodeSourceOfDivergence(const SDNode *N, + FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp index a2f844d7854e..61c8f359e168 100644 --- a/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This pass inserts branches on the 0 exec mask over divergent branches +/// This pass inserts branches on the 0 exec mask over divergent branches /// branches when it's expected that jumping over the untaken control flow will /// be cheaper than having every workitem no-op through it. 
// @@ -18,6 +18,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -210,65 +211,73 @@ void SIInsertSkips::kill(MachineInstr &MI) { switch (MI.getOperand(2).getImm()) { case ISD::SETOEQ: case ISD::SETEQ: - Opcode = AMDGPU::V_CMPX_EQ_F32_e32; + Opcode = AMDGPU::V_CMPX_EQ_F32_e64; break; case ISD::SETOGT: case ISD::SETGT: - Opcode = AMDGPU::V_CMPX_LT_F32_e32; + Opcode = AMDGPU::V_CMPX_LT_F32_e64; break; case ISD::SETOGE: case ISD::SETGE: - Opcode = AMDGPU::V_CMPX_LE_F32_e32; + Opcode = AMDGPU::V_CMPX_LE_F32_e64; break; case ISD::SETOLT: case ISD::SETLT: - Opcode = AMDGPU::V_CMPX_GT_F32_e32; + Opcode = AMDGPU::V_CMPX_GT_F32_e64; break; case ISD::SETOLE: case ISD::SETLE: - Opcode = AMDGPU::V_CMPX_GE_F32_e32; + Opcode = AMDGPU::V_CMPX_GE_F32_e64; break; case ISD::SETONE: case ISD::SETNE: - Opcode = AMDGPU::V_CMPX_LG_F32_e32; + Opcode = AMDGPU::V_CMPX_LG_F32_e64; break; case ISD::SETO: - Opcode = AMDGPU::V_CMPX_O_F32_e32; + Opcode = AMDGPU::V_CMPX_O_F32_e64; break; case ISD::SETUO: - Opcode = AMDGPU::V_CMPX_U_F32_e32; + Opcode = AMDGPU::V_CMPX_U_F32_e64; break; case ISD::SETUEQ: - Opcode = AMDGPU::V_CMPX_NLG_F32_e32; + Opcode = AMDGPU::V_CMPX_NLG_F32_e64; break; case ISD::SETUGT: - Opcode = AMDGPU::V_CMPX_NGE_F32_e32; + Opcode = AMDGPU::V_CMPX_NGE_F32_e64; break; case ISD::SETUGE: - Opcode = AMDGPU::V_CMPX_NGT_F32_e32; + Opcode = AMDGPU::V_CMPX_NGT_F32_e64; break; case ISD::SETULT: - Opcode = AMDGPU::V_CMPX_NLE_F32_e32; + Opcode = AMDGPU::V_CMPX_NLE_F32_e64; break; case ISD::SETULE: - Opcode = AMDGPU::V_CMPX_NLT_F32_e32; + Opcode = AMDGPU::V_CMPX_NLT_F32_e64; break; case ISD::SETUNE: - Opcode = AMDGPU::V_CMPX_NEQ_F32_e32; + Opcode = AMDGPU::V_CMPX_NEQ_F32_e64; break; default: llvm_unreachable("invalid ISD:SET cond code"); } - // TODO: Allow this: - if (!MI.getOperand(0).isReg() || - !TRI->isVGPR(MBB.getParent()->getRegInfo(), - MI.getOperand(0).getReg())) - llvm_unreachable("SI_KILL operand should be a VGPR"); - - BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .add(MI.getOperand(1)) - .add(MI.getOperand(0)); + assert(MI.getOperand(0).isReg()); + + if (TRI->isVGPR(MBB.getParent()->getRegInfo(), + MI.getOperand(0).getReg())) { + Opcode = AMDGPU::getVOPe32(Opcode); + BuildMI(MBB, &MI, DL, TII->get(Opcode)) + .add(MI.getOperand(1)) + .add(MI.getOperand(0)); + } else { + BuildMI(MBB, &MI, DL, TII->get(Opcode)) + .addReg(AMDGPU::VCC, RegState::Define) + .addImm(0) // src0 modifiers + .add(MI.getOperand(1)) + .addImm(0) // src1 modifiers + .add(MI.getOperand(0)) + .addImm(0); // omod + } break; } case AMDGPU::SI_KILL_I1_TERMINATOR: { @@ -330,7 +339,7 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, } bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); SkipThreshold = SkipThresholdFlag; diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6bbe5979316d..d456e3d9b94d 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Insert wait instructions for memory reads and writes. 
+/// Insert wait instructions for memory reads and writes. /// /// Memory reads and writes are issued asynchronously, so we need to insert /// S_WAITCNT instructions when we want to access any of their results or @@ -40,6 +40,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -50,9 +51,21 @@ #include <utility> #include <vector> +using namespace llvm; + #define DEBUG_TYPE "si-insert-waitcnts" -using namespace llvm; +DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp", + "Force emit s_waitcnt expcnt(0) instrs"); +DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm", + "Force emit s_waitcnt lgkmcnt(0) instrs"); +DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm", + "Force emit s_waitcnt vmcnt(0) instrs"); + +static cl::opt<unsigned> ForceEmitZeroFlag( + "amdgpu-waitcnt-forcezero", + cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), + cl::init(0), cl::Hidden); namespace { @@ -115,15 +128,15 @@ enum RegisterMapping { (w) = (enum WaitEventType)((w) + 1)) // This is a per-basic-block object that maintains current score brackets -// of each wait-counter, and a per-register scoreboard for each wait-couner. +// of each wait counter, and a per-register scoreboard for each wait counter. // We also maintain the latest score for every event type that can change the // waitcnt in order to know if there are multiple types of events within // the brackets. When multiple types of event happen in the bracket, -// wait-count may get decreased out of order, therefore we need to put in +// wait count may get decreased out of order, therefore we need to put in // "s_waitcnt 0" before use. 
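For context, a minimal standalone sketch of the score-bracket rule the comment above describes: every memory event bumps an upper-bound score, and a use whose score still falls inside the open-ended interval (LB, UB] needs an s_waitcnt before it. The ScoreBracket type and its methods are hypothetical simplifications, not the real BlockWaitcntBrackets interface; the real pass additionally keeps per-register scoreboards for each of the VM/EXP/LGKM counters and is less conservative about how far it advances the lower bound.

#include <cassert>

// Simplified model of one wait-counter bracket: scores in (LB, UB] are
// still outstanding; anything <= LB has already been waited on.
struct ScoreBracket {
  int LB = 0; // everything up to and including LB is known complete
  int UB = 0; // highest score handed out so far

  // Each new memory event gets the next score.
  int pushEvent() { return ++UB; }

  // Returns true (and closes the bracket) if consuming a value tagged with
  // ScoreToWait requires an s_waitcnt first. This sketch always waits on
  // everything outstanding, which the real pass only does in some cases.
  bool updateByWait(int ScoreToWait) {
    if (UB >= ScoreToWait && ScoreToWait > LB) {
      LB = UB;
      return true;
    }
    return false;
  }
};

int main() {
  ScoreBracket VmCnt;
  int Load = VmCnt.pushEvent();      // e.g. a buffer load was issued
  assert(VmCnt.updateByWait(Load));  // first use of the result: wait needed
  assert(!VmCnt.updateByWait(Load)); // already waited on: no further wait
  return 0;
}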
class BlockWaitcntBrackets { public: - BlockWaitcntBrackets() { + BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { memset(VgprScores[T], 0, sizeof(VgprScores[T])); @@ -301,6 +314,7 @@ public: void dump() { print(dbgs()); } private: + const GCNSubtarget *ST = nullptr; bool WaitAtBeginning = false; bool RevisitLoop = false; bool MixedExpTypes = false; @@ -332,14 +346,12 @@ public: void incIterCnt() { IterCnt++; } void resetIterCnt() { IterCnt = 0; } - int32_t getIterCnt() { return IterCnt; } + unsigned getIterCnt() { return IterCnt; } void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; } MachineInstr *getWaitcnt() const { return LfWaitcnt; } - void print() { - DEBUG(dbgs() << " iteration " << IterCnt << '\n';); - } + void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); } private: // s_waitcnt added at the end of loop footer to stablize wait scores @@ -352,7 +364,7 @@ private: class SIInsertWaitcnts : public MachineFunctionPass { private: - const SISubtarget *ST = nullptr; + const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; @@ -361,22 +373,31 @@ private: AMDGPUAS AMDGPUASI; DenseSet<MachineBasicBlock *> BlockVisitedSet; - DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet; + DenseSet<MachineInstr *> TrackedWaitcntSet; DenseSet<MachineInstr *> VCCZBugHandledSet; DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>> BlockWaitcntBracketsMap; - DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet; + std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet; DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap; std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets; + // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 + // because of amdgpu-waitcnt-forcezero flag + bool ForceEmitZeroWaitcnts; + bool ForceEmitWaitcnt[NUM_INST_CNTS]; + public: static char ID; - SIInsertWaitcnts() : MachineFunctionPass(ID) {} + SIInsertWaitcnts() : MachineFunctionPass(ID) { + (void)ForceExpCounter; + (void)ForceLgkmCounter; + (void)ForceVMCounter; + } bool runOnMachineFunction(MachineFunction &MF) override; @@ -397,15 +418,53 @@ public: llvm::make_unique<BlockWaitcntBrackets>(*Bracket)); } + bool isForceEmitWaitcnt() const { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) + if (ForceEmitWaitcnt[T]) + return true; + return false; + } + + void setForceEmitWaitcnt() { +// For non-debug builds, ForceEmitWaitcnt has been initialized to false; +// For debug builds, get the debug counter info and adjust if need be +#ifndef NDEBUG + if (DebugCounter::isCounterSet(ForceExpCounter) && + DebugCounter::shouldExecute(ForceExpCounter)) { + ForceEmitWaitcnt[EXP_CNT] = true; + } else { + ForceEmitWaitcnt[EXP_CNT] = false; + } + + if (DebugCounter::isCounterSet(ForceLgkmCounter) && + DebugCounter::shouldExecute(ForceLgkmCounter)) { + ForceEmitWaitcnt[LGKM_CNT] = true; + } else { + ForceEmitWaitcnt[LGKM_CNT] = false; + } + + if (DebugCounter::isCounterSet(ForceVMCounter) && + DebugCounter::shouldExecute(ForceVMCounter)) { + ForceEmitWaitcnt[VM_CNT] = true; + } else { + ForceEmitWaitcnt[VM_CNT] = false; + } +#endif // NDEBUG + } + bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI, - BlockWaitcntBrackets 
*ScoreBrackets); - void updateEventWaitCntAfter(MachineInstr &Inst, + void generateWaitcntInstBefore(MachineInstr &MI, + BlockWaitcntBrackets *ScoreBrackets); + void updateEventWaitcntAfter(MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets); void mergeInputScoreBrackets(MachineBasicBlock &Block); - MachineBasicBlock *loopBottom(const MachineLoop *Loop); + bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block); + unsigned countNumBottomBlocks(const MachineLoop *Loop); void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block); void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst); + bool isWaitcntStronger(unsigned LHS, unsigned RHS); + unsigned combineWaitcnt(unsigned LHS, unsigned RHS); }; } // end anonymous namespace @@ -459,7 +518,7 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI, const MachineRegisterInfo *MRI, unsigned OpNo, int32_t Val) { RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false); - DEBUG({ + LLVM_DEBUG({ const MachineOperand &Opnd = MI->getOperand(OpNo); assert(TRI->isVGPR(*MRI, Opnd.getReg())); }); @@ -681,14 +740,17 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, const int32_t LB = getScoreLB(T); const int32_t UB = getScoreUB(T); if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { - if (T == VM_CNT && hasPendingFlat()) { - // If there is a pending FLAT operation, and this is a VM waitcnt, - // then we need to force a waitcnt 0 for VM. + if ((T == VM_CNT || T == LGKM_CNT) && + hasPendingFlat() && + !ST->hasFlatLgkmVMemCountInOrder()) { + // If there is a pending FLAT operation, and this is a VMem or LGKM + // waitcnt and the target can report early completion, then we need + // to force a waitcnt 0. NeedWait = CNT_MASK(T); setScoreLB(T, getScoreUB(T)); } else if (counterOutOfOrder(T)) { // Counter can get decremented out-of-order when there - // are multiple types event in the brack. Also emit an s_wait counter + // are multiple types event in the bracket. Also emit an s_wait counter // with a conservative value of 0 for the counter. NeedWait = CNT_MASK(T); setScoreLB(T, getScoreUB(T)); @@ -789,7 +851,30 @@ static bool readsVCCZ(const MachineInstr &MI) { !MI.getOperand(1).isUndef(); } -/// \brief Generate s_waitcnt instruction to be placed before cur_Inst. +/// Given wait count encodings checks if LHS is stronger than RHS. +bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) { + if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS)) + return false; + if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS)) + return false; + if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS)) + return false; + return true; +} + +/// Given wait count encodings create a new encoding which is stronger +/// or equal to both. +unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) { + unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS), + AMDGPU::decodeVmcnt(IV, RHS)); + unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS), + AMDGPU::decodeLgkmcnt(IV, RHS)); + unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS), + AMDGPU::decodeExpcnt(IV, RHS)); + return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt); +} + +/// Generate s_waitcnt instruction to be placed before cur_Inst. /// Instructions of a given type are returned in order, /// but instructions of different types can complete out of order. 
/// We rely on this in-order completion @@ -799,23 +884,29 @@ static bool readsVCCZ(const MachineInstr &MI) { /// and if so what the value of each counter is. /// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). -MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( +void SIInsertWaitcnts::generateWaitcntInstBefore( MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) { // To emit, or not to emit - that's the question! // Start with an assumption that there is no need to emit. - unsigned int EmitSwaitcnt = 0; - // s_waitcnt instruction to return; default is NULL. - MachineInstr *SWaitInst = nullptr; + unsigned int EmitWaitcnt = 0; + // No need to wait before phi. If a phi-move exists, then the wait should // has been inserted before the move. If a phi-move does not exist, then // wait should be inserted before the real use. The same is true for // sc-merge. It is not a coincident that all these cases correspond to the // instructions that are skipped in the assembling loop. bool NeedLineMapping = false; // TODO: Check on this. - if (MI.isDebugValue() && + + // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug + bool ForceEmitZeroWaitcnt = false; + + setForceEmitWaitcnt(); + bool IsForceEmitWaitcnt = isForceEmitWaitcnt(); + + if (MI.isDebugInstr() && // TODO: any other opcode? !NeedLineMapping) { - return SWaitInst; + return; } // See if an s_waitcnt is forced at block entry, or is needed at @@ -826,7 +917,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( ScoreBrackets->clearWaitAtBeginning(); for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { - EmitSwaitcnt |= CNT_MASK(T); + EmitWaitcnt |= CNT_MASK(T); ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); } } @@ -836,21 +927,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { - EmitSwaitcnt |= + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); } // All waits must be resolved at call return. // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. - if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - EmitSwaitcnt |= CNT_MASK(T); + EmitWaitcnt |= CNT_MASK(T); } } } @@ -861,7 +951,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( AMDGPU::SendMsg::ID_GS_DONE)) { if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) { ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitSwaitcnt |= CNT_MASK(VM_CNT); + EmitWaitcnt |= CNT_MASK(VM_CNT); } } #if 0 // TODO: the following blocks of logic when we have fence. 
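The isWaitcntStronger/combineWaitcnt helpers added above compare and merge S_WAITCNT immediates counter by counter: a smaller count means a stricter wait, so one encoding is "stronger" when none of its counts exceed the other's, and the combination takes the per-counter minimum. A minimal standalone sketch of that rule, using a made-up three-field encoding in place of the real AMDGPU::encodeWaitcnt/decodeVmcnt/decodeExpcnt/decodeLgkmcnt helpers (field widths are illustrative only):

#include <algorithm>
#include <cassert>

// Toy encoding: vmcnt in bits [3:0], expcnt in [6:4], lgkmcnt in [10:7].
// The real layout comes from the ISA-specific AMDGPU helpers.
struct Waitcnt { unsigned Vm, Exp, Lgkm; };

static Waitcnt decode(unsigned Enc) {
  return {Enc & 0xf, (Enc >> 4) & 0x7, (Enc >> 7) & 0xf};
}

static unsigned encode(Waitcnt W) {
  return (W.Vm & 0xf) | ((W.Exp & 0x7) << 4) | ((W.Lgkm & 0xf) << 7);
}

// LHS is stronger than (or equal to) RHS if it waits at least as hard on
// every counter, i.e. none of its counts exceed the corresponding RHS count.
static bool isStronger(unsigned LHS, unsigned RHS) {
  Waitcnt L = decode(LHS), R = decode(RHS);
  return L.Vm <= R.Vm && L.Exp <= R.Exp && L.Lgkm <= R.Lgkm;
}

// Produce an encoding at least as strong as both inputs by taking the
// per-counter minimum.
static unsigned combine(unsigned LHS, unsigned RHS) {
  Waitcnt L = decode(LHS), R = decode(RHS);
  return encode({std::min(L.Vm, R.Vm), std::min(L.Exp, R.Exp),
                 std::min(L.Lgkm, R.Lgkm)});
}

int main() {
  unsigned A = encode({0, 7, 15}); // vmcnt(0), other counters unconstrained
  unsigned B = encode({3, 0, 15}); // expcnt(0)
  unsigned C = combine(A, B);      // waits on vmcnt(0) and expcnt(0)
  assert(isStronger(C, A) && isStronger(C, B));
  assert(!isStronger(A, B));       // neither input subsumes the other
  return 0;
}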
@@ -879,11 +969,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( case SCMEM_LDS: if (group_is_multi_wave || context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); // LDS may have to wait for VM_CNT after buffer load to LDS if (target_info->HasBufferLoadToLDS()) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); } } @@ -891,9 +981,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( case SCMEM_GDS: if (group_is_multi_wave || fence_is_global) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); } break; @@ -903,9 +993,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( case SCMEM_RING: case SCMEM_SCATTER: if (group_is_multi_wave || fence_is_global) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); } break; @@ -926,13 +1016,13 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { // Export and GDS are tracked individually, either may trigger a waitcnt // for EXEC. - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK)); } @@ -947,7 +1037,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( if (ScoreBrackets->getScoreUB(EXP_CNT) > ScoreBrackets->getScoreLB(EXP_CNT)) { ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= CNT_MASK(EXP_CNT); + EmitWaitcnt |= CNT_MASK(EXP_CNT); } } #endif @@ -965,7 +1055,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); } @@ -977,10 +1067,10 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Op.getReg())) { // VM_CNT is only relevant to vgpr or LDS. 
- EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); } - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); } } @@ -999,9 +1089,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( if (AS != AMDGPUASI.LOCAL_ADDRESS) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); } } @@ -1012,38 +1102,35 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Def.getReg())) { - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); } - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); } } // End of for loop that looks at all dest operands. } - // TODO: Tie force zero to a compiler triage option. - bool ForceZero = false; - // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0 // occurs before the instruction. Doing it here prevents any additional // S_WAITCNTs from being emitted if the instruction was marked as // requiring a WAITCNT beforehand. if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier()) { - EmitSwaitcnt |= + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); } // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. - if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { + if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) { if (ScoreBrackets->getScoreLB(LGKM_CNT) < ScoreBrackets->getScoreUB(LGKM_CNT) && ScoreBrackets->hasPendingSMEM()) { @@ -1052,17 +1139,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( // block, so if we only wait on LGKM here, we might end up with // another s_waitcnt inserted right after this if there are non-LGKM // instructions still outstanding. - ForceZero = true; - EmitSwaitcnt = true; + // FIXME: this is too conservative / the comment is wrong. + // We don't wait on everything at the end of the block and we combine + // waitcnts so we should never have back-to-back waitcnts. + ForceEmitZeroWaitcnt = true; + EmitWaitcnt = true; } } // Does this operand processing indicate s_wait counter update? 
- if (EmitSwaitcnt) { + if (EmitWaitcnt || IsForceEmitWaitcnt) { int CntVal[NUM_INST_CNTS]; bool UseDefaultWaitcntStrategy = true; - if (ForceZero) { + if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) { // Force all waitcnts to 0. for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { @@ -1077,7 +1167,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( if (UseDefaultWaitcntStrategy) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { - if (EmitSwaitcnt & CNT_MASK(T)) { + if (EmitWaitcnt & CNT_MASK(T)) { int Delta = ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T); int MaxDelta = ScoreBrackets->getWaitCountMax(T); @@ -1087,7 +1177,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( ScoreBrackets->setScoreLB( T, ScoreBrackets->getScoreUB(T) - MaxDelta); } - EmitSwaitcnt &= ~CNT_MASK(T); + EmitWaitcnt &= ~CNT_MASK(T); } CntVal[T] = Delta; } else { @@ -1099,10 +1189,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( } // If we are not waiting on any counter we can skip the wait altogether. - if (EmitSwaitcnt != 0) { + if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) { MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt(); int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm(); - if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) != + if (!OldWaitcnt || + (AMDGPU::decodeVmcnt(IV, Imm) != (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) || (AMDGPU::decodeExpcnt(IV, Imm) != (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) || @@ -1114,39 +1205,80 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); if (!ScoreBracket) { - assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end()); + assert(!BlockVisitedSet.count(TBB)); BlockWaitcntBracketsMap[TBB] = - llvm::make_unique<BlockWaitcntBrackets>(); + llvm::make_unique<BlockWaitcntBrackets>(ST); ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); } ScoreBracket->setRevisitLoop(true); - DEBUG(dbgs() << "set-revisit: block" - << ContainingLoop->getHeader()->getNumber() << '\n';); + LLVM_DEBUG(dbgs() + << "set-revisit2: Block" + << ContainingLoop->getHeader()->getNumber() << '\n';); } } // Update an existing waitcount, or make a new one. - MachineFunction &MF = *MI.getParent()->getParent(); - if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) { - SWaitInst = OldWaitcnt; - } else { - SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT), - MI.getDebugLoc()); - CompilerGeneratedWaitcntSet.insert(SWaitInst); - } + unsigned Enc = AMDGPU::encodeWaitcnt(IV, + ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT], + ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT], + ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]); + // We don't remove waitcnts that existed prior to the waitcnt + // pass. Check if the waitcnt to-be-inserted can be avoided + // or if the prev waitcnt can be updated. 
+ bool insertSWaitInst = true; + for (MachineBasicBlock::iterator I = MI.getIterator(), + B = MI.getParent()->begin(); + insertSWaitInst && I != B; --I) { + if (I == MI.getIterator()) + continue; - const MachineOperand &Op = - MachineOperand::CreateImm(AMDGPU::encodeWaitcnt( - IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT])); - SWaitInst->addOperand(MF, Op); + switch (I->getOpcode()) { + case AMDGPU::S_WAITCNT: + if (isWaitcntStronger(I->getOperand(0).getImm(), Enc)) + insertSWaitInst = false; + else if (!OldWaitcnt) { + OldWaitcnt = &*I; + Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc); + } + break; + // TODO: skip over instructions which never require wait. + } + break; + } + if (insertSWaitInst) { + if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) { + if (ForceEmitZeroWaitcnts) + LLVM_DEBUG( + dbgs() + << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n"); + if (IsForceEmitWaitcnt) + LLVM_DEBUG(dbgs() + << "Force emit a s_waitcnt due to debug counter\n"); + + OldWaitcnt->getOperand(0).setImm(Enc); + if (!OldWaitcnt->getParent()) + MI.getParent()->insert(MI, OldWaitcnt); + + LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *OldWaitcnt << '\n'); + } else { + auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), + MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(Enc); + TrackedWaitcntSet.insert(SWaitInst); + + LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *SWaitInst << '\n'); + } + } if (CntVal[EXP_CNT] == 0) { ScoreBrackets->setMixedExpTypes(false); } } } - - return SWaitInst; } void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB, @@ -1180,7 +1312,7 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { return false; } -void SIInsertWaitcnts::updateEventWaitCntAfter( +void SIInsertWaitcnts::updateEventWaitcntAfter( MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) { // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's @@ -1214,7 +1346,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC && Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); - if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() && + if (ST->vmemWriteNeedsExpWaitcnt() && (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); } @@ -1247,27 +1379,37 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( } } +// Merge the score brackets of the Block's predecessors; +// this merged score bracket is used when adding waitcnts to the Block void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); int32_t MaxPending[NUM_INST_CNTS] = {0}; int32_t MaxFlat[NUM_INST_CNTS] = {0}; bool MixedExpTypes = false; - // Clear the score bracket state. - ScoreBrackets->clear(); - - // Compute the number of pending elements on block entry. + // For single basic block loops, we need to retain the Block's + // score bracket to have accurate Pred info. So, make a copy of Block's + // score bracket, clear() it (which retains several important bits of info), + // populate, and then replace en masse. 
For non-single basic block loops, + // just clear Block's current score bracket and repopulate in-place. + bool IsSelfPred; + std::unique_ptr<BlockWaitcntBrackets> S; + + IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block)) + != Block.pred_end(); + if (IsSelfPred) { + S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets); + ScoreBrackets = S.get(); + } - // IMPORTANT NOTE: If iterative handling of loops is added, the code will - // need to handle single BBs with backedges to themselves. This means that - // they will need to retain and not clear their initial state. + ScoreBrackets->clear(); // See if there are any uninitialized predecessors. If so, emit an // s_waitcnt 0 at the beginning of the block. - for (MachineBasicBlock *pred : Block.predecessors()) { + for (MachineBasicBlock *Pred : Block.predecessors()) { BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[pred].get(); - bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end(); + BlockWaitcntBracketsMap[Pred].get(); + bool Visited = BlockVisitedSet.count(Pred); if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { continue; } @@ -1306,7 +1448,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { for (MachineBasicBlock *Pred : Block.predecessors()) { BlockWaitcntBrackets *PredScoreBrackets = BlockWaitcntBracketsMap[Pred].get(); - bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end(); + bool Visited = BlockVisitedSet.count(Pred); if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { continue; } @@ -1354,7 +1496,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { // Set the register scoreboard. for (MachineBasicBlock *Pred : Block.predecessors()) { - if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { + if (!BlockVisitedSet.count(Pred)) { continue; } @@ -1468,7 +1610,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { // sequencing predecessors, because changes to EXEC require waitcnts due to // the delayed nature of these operations. for (MachineBasicBlock *Pred : Block.predecessors()) { - if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { + if (!BlockVisitedSet.count(Pred)) { continue; } @@ -1496,17 +1638,36 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { } } } + + // if a single block loop, update the score brackets. Not needed for other + // blocks, as we did this in-place + if (IsSelfPred) { + BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets); + } } -/// Return the "bottom" block of a loop. This differs from -/// MachineLoop::getBottomBlock in that it works even if the loop is -/// discontiguous. -MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) { - MachineBasicBlock *Bottom = Loop->getHeader(); - for (MachineBasicBlock *MBB : Loop->blocks()) - if (MBB->getNumber() > Bottom->getNumber()) - Bottom = MBB; - return Bottom; +/// Return true if the given basic block is a "bottom" block of a loop. +/// This works even if the loop is discontiguous. This also handles +/// multiple back-edges for the same "header" block of a loop. +bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop, + const MachineBasicBlock *Block) { + for (MachineBasicBlock *MBB : Loop->blocks()) { + if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) { + return true; + } + } + return false; +} + +/// Count the number of "bottom" basic blocks of a loop. 
+unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) { + unsigned Count = 0; + for (MachineBasicBlock *MBB : Loop->blocks()) { + if (MBB->isSuccessor(Loop->getHeader())) { + Count++; + } + } + return Count; } // Generate s_waitcnt instructions where needed. @@ -1517,8 +1678,8 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); - DEBUG({ - dbgs() << "Block" << Block.getNumber(); + LLVM_DEBUG({ + dbgs() << "*** Block" << Block.getNumber() << " ***"; ScoreBrackets->dump(); }); @@ -1528,16 +1689,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, MachineInstr &Inst = *Iter; // Remove any previously existing waitcnts. if (Inst.getOpcode() == AMDGPU::S_WAITCNT) { - // TODO: Register the old waitcnt and optimize the following waitcnts. - // Leaving the previously existing waitcnts is conservatively correct. - if (CompilerGeneratedWaitcntSet.find(&Inst) == - CompilerGeneratedWaitcntSet.end()) + // Leave pre-existing waitcnts, but note their existence via setWaitcnt. + // Remove the waitcnt-pass-generated waitcnts; the pass will add them back + // as needed. + if (!TrackedWaitcntSet.count(&Inst)) ++Iter; else { - ScoreBrackets->setWaitcnt(&Inst); ++Iter; Inst.removeFromParent(); } + ScoreBrackets->setWaitcnt(&Inst); continue; } @@ -1550,29 +1711,20 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, bool VCCZBugWorkAround = false; if (readsVCCZ(Inst) && - (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) { + (!VCCZBugHandledSet.count(&Inst))) { if (ScoreBrackets->getScoreLB(LGKM_CNT) < ScoreBrackets->getScoreUB(LGKM_CNT) && ScoreBrackets->hasPendingSMEM()) { - if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) + if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) VCCZBugWorkAround = true; } } // Generate an s_waitcnt instruction to be placed before // cur_Inst, if needed. - MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets); - - if (SWaitInst) { - Block.insert(Inst, SWaitInst); - if (ScoreBrackets->getWaitcnt() != SWaitInst) { - DEBUG(dbgs() << "insertWaitcntInBlock\n" - << "Old Instr: " << Inst << '\n' - << "New Instr: " << *SWaitInst << '\n';); - } - } + generateWaitcntInstBefore(Inst, ScoreBrackets); - updateEventWaitCntAfter(Inst, ScoreBrackets); + updateEventWaitcntAfter(Inst, ScoreBrackets); #if 0 // TODO: implement resource type check controlled by options with ub = LB. // If this instruction generates a S_SETVSKIP because it is an @@ -1587,10 +1739,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ScoreBrackets->clearWaitcnt(); - if (SWaitInst) { - DEBUG({ SWaitInst->print(dbgs() << '\n'); }); - } - DEBUG({ + LLVM_DEBUG({ Inst.print(dbgs()); ScoreBrackets->dump(); }); @@ -1627,21 +1776,22 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // Check if we need to force convergence at loop footer. MachineLoop *ContainingLoop = MLI->getLoopFor(&Block); - if (ContainingLoop && loopBottom(ContainingLoop) == &Block) { + if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) { LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); WaitcntData->print(); - DEBUG(dbgs() << '\n';); + LLVM_DEBUG(dbgs() << '\n';); // The iterative waitcnt insertion algorithm aims for optimal waitcnt - // placement and doesn't always guarantee convergence for a loop. Each - // loop should take at most 2 iterations for it to converge naturally. 
- // When this max is reached and result doesn't converge, we force - // convergence by inserting a s_waitcnt at the end of loop footer. - if (WaitcntData->getIterCnt() > 2) { + // placement, but doesn't guarantee convergence for a loop. Each + // loop should take at most (n+1) iterations for it to converge naturally, + // where n is the number of bottom blocks. If this threshold is reached and + // the result hasn't converged, then we force convergence by inserting + // a s_waitcnt at the end of loop footer. + if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) { // To ensure convergence, need to make wait events at loop footer be no // more than those from the previous iteration. - // As a simplification, Instead of tracking individual scores and - // generate the precise wait count, just wait on 0. + // As a simplification, instead of tracking individual scores and + // generating the precise wait count, just wait on 0. bool HasPending = false; MachineInstr *SWaitInst = WaitcntData->getWaitcnt(); for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; @@ -1649,16 +1799,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); HasPending = true; + break; } } if (HasPending) { if (!SWaitInst) { - SWaitInst = Block.getParent()->CreateMachineInstr( - TII->get(AMDGPU::S_WAITCNT), DebugLoc()); - CompilerGeneratedWaitcntSet.insert(SWaitInst); - const MachineOperand &Op = MachineOperand::CreateImm(0); - SWaitInst->addOperand(MF, Op); + SWaitInst = BuildMI(Block, Block.getFirstNonPHI(), + DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + TrackedWaitcntSet.insert(SWaitInst); #if 0 // TODO: Format the debug output OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context); OutputTransformAdd(SWaitInst, context); @@ -1670,7 +1820,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } if (SWaitInst) { - DEBUG({ + LLVM_DEBUG({ SWaitInst->print(dbgs()); dbgs() << "\nAdjusted score board:"; ScoreBrackets->dump(); @@ -1678,7 +1828,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // Add this waitcnt to the block. It is either newly created or // created in previous iterations and added back since block traversal - // always remove waitcnt. + // always removes waitcnts. 
insertWaitcntBeforeCF(Block, SWaitInst); WaitcntData->setWaitcnt(SWaitInst); } @@ -1687,7 +1837,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { - ST = &MF.getSubtarget<SISubtarget>(); + ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); @@ -1696,6 +1846,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); AMDGPUASI = ST->getAMDGPUAS(); + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) + ForceEmitWaitcnt[T] = false; + HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); @@ -1712,6 +1867,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1; + TrackedWaitcntSet.clear(); + BlockVisitedSet.clear(); + VCCZBugHandledSet.clear(); + LoopWaitcntDataMap.clear(); + BlockWaitcntProcessedSet.clear(); + // Walk over the blocks in reverse post-dominator order, inserting // s_waitcnt where needed. ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); @@ -1726,7 +1887,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); if (!ScoreBrackets) { - BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(); + BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST); ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); } ScoreBrackets->setPostOrder(MBB.getNumber()); @@ -1737,22 +1898,30 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // If we are walking into the block from before the loop, then guarantee // at least 1 re-walk over the loop to propagate the information, even if // no S_WAITCNT instructions were generated. - if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I && - (BlockWaitcntProcessedSet.find(&MBB) == - BlockWaitcntProcessedSet.end())) { - BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); - DEBUG(dbgs() << "set-revisit: block" - << ContainingLoop->getHeader()->getNumber() << '\n';); + if (ContainingLoop && ContainingLoop->getHeader() == &MBB) { + unsigned Count = countNumBottomBlocks(ContainingLoop); + + // If the loop has multiple back-edges, and so more than one "bottom" + // basic block, we have to guarantee a re-walk over every blocks. + if ((std::count(BlockWaitcntProcessedSet.begin(), + BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) { + BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); + LLVM_DEBUG(dbgs() << "set-revisit1: Block" + << ContainingLoop->getHeader()->getNumber() << '\n';); + } } // Walk over the instructions. insertWaitcntInBlock(MF, MBB); - // Flag that waitcnts have been processed at least once. - BlockWaitcntProcessedSet.insert(&MBB); + // Record that waitcnts have been processed at least once for this block. + BlockWaitcntProcessedSet.push_back(&MBB); - // See if we want to revisit the loop. - if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) { + // See if we want to revisit the loop. If a loop has multiple back-edges, + // we shouldn't revisit the same "bottom" basic block. 
+ if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) && + std::count(BlockWaitcntProcessedSet.begin(), + BlockWaitcntProcessedSet.end(), &MBB) == 1) { MachineBasicBlock *EntryBB = ContainingLoop->getHeader(); BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get(); if (EntrySB && EntrySB->getRevisitLoop()) { @@ -1772,7 +1941,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); WaitcntData->incIterCnt(); - DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';); + LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';); continue; } else { LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); @@ -1837,7 +2006,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may - // depend on. We can't track them and it's better to to the wait after the + // depend on. We can't track them and it's better to the wait after the // costly call sequence. // TODO: Could insert earlier and schedule more liberally with operations diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp deleted file mode 100644 index b074b95c2d3c..000000000000 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ /dev/null @@ -1,703 +0,0 @@ -//===- SILowerControlFlow.cpp - Use predicates for control flow -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Insert wait instructions for memory reads and writes. -/// -/// Memory reads and writes are issued asynchronously, so we need to insert -/// S_WAITCNT instructions when we want to access any of their results or -/// overwrite any register that's used asynchronously. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "SIRegisterInfo.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include <algorithm> -#include <cassert> -#include <cstdint> -#include <cstring> -#include <utility> - -#define DEBUG_TYPE "si-insert-waits" - -using namespace llvm; - -namespace { - -/// \brief One variable for each of the hardware counters -using Counters = union { - struct { - unsigned VM; - unsigned EXP; - unsigned LGKM; - } Named; - unsigned Array[3]; -}; - -using InstType = enum { - OTHER, - SMEM, - VMEM -}; - -using RegCounters = Counters[512]; -using RegInterval = std::pair<unsigned, unsigned>; - -class SIInsertWaits : public MachineFunctionPass { -private: - const SISubtarget *ST = nullptr; - const SIInstrInfo *TII = nullptr; - const SIRegisterInfo *TRI = nullptr; - const MachineRegisterInfo *MRI; - AMDGPU::IsaInfo::IsaVersion ISA; - - /// \brief Constant zero value - static const Counters ZeroCounts; - - /// \brief Hardware limits - Counters HardwareLimits; - - /// \brief Counter values we have already waited on. - Counters WaitedOn; - - /// \brief Counter values that we must wait on before the next counter - /// increase. - Counters DelayedWaitOn; - - /// \brief Counter values for last instruction issued. - Counters LastIssued; - - /// \brief Registers used by async instructions. - RegCounters UsedRegs; - - /// \brief Registers defined by async instructions. - RegCounters DefinedRegs; - - /// \brief Different export instruction types seen since last wait. - unsigned ExpInstrTypesSeen = 0; - - /// \brief Type of the last opcode. - InstType LastOpcodeType; - - bool LastInstWritesM0; - - /// Whether or not we have flat operations outstanding. - bool IsFlatOutstanding; - - /// \brief Whether the machine function returns void - bool ReturnsVoid; - - /// Whether the VCCZ bit is possibly corrupt - bool VCCZCorrupt = false; - - /// \brief Get increment/decrement amount for this instruction. - Counters getHwCounts(MachineInstr &MI); - - /// \brief Is operand relevant for async execution? - bool isOpRelevant(MachineOperand &Op); - - /// \brief Get register interval an operand affects. - RegInterval getRegInterval(const TargetRegisterClass *RC, - const MachineOperand &Reg) const; - - /// \brief Handle instructions async components - void pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters& Increment); - - /// \brief Insert the actual wait instruction - bool insertWait(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Counts); - - /// \brief Handle existing wait instructions (from intrinsics) - void handleExistingWait(MachineBasicBlock::iterator I); - - /// \brief Do we need def2def checks? 
- bool unorderedDefines(MachineInstr &MI); - - /// \brief Resolve all operand dependencies to counter requirements - Counters handleOperands(MachineInstr &MI); - - /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. - void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); - - /// Return true if there are LGKM instrucitons that haven't been waited on - /// yet. - bool hasOutstandingLGKM() const; - -public: - static char ID; - - SIInsertWaits() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { - return "SI insert wait instructions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // end anonymous namespace - -INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE, - "SI Insert Waits", false, false) -INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE, - "SI Insert Waits", false, false) - -char SIInsertWaits::ID = 0; - -char &llvm::SIInsertWaitsID = SIInsertWaits::ID; - -FunctionPass *llvm::createSIInsertWaitsPass() { - return new SIInsertWaits(); -} - -const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; - -static bool readsVCCZ(const MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && - !MI.getOperand(1).isUndef(); -} - -bool SIInsertWaits::hasOutstandingLGKM() const { - return WaitedOn.Named.LGKM != LastIssued.Named.LGKM; -} - -Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - uint64_t TSFlags = MI.getDesc().TSFlags; - Counters Result = { { 0, 0, 0 } }; - - Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); - - // Only consider stores or EXP for EXP_CNT - Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore(); - - // LGKM may uses larger values - if (TSFlags & SIInstrFlags::LGKM_CNT) { - - if (TII->isSMRD(MI)) { - - if (MI.getNumOperands() != 0) { - assert(MI.getOperand(0).isReg() && - "First LGKM operand must be a register!"); - - // XXX - What if this is a write into a super register? - const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); - unsigned Size = TRI->getRegSizeInBits(*RC); - Result.Named.LGKM = Size > 32 ? 2 : 1; - } else { - // s_dcache_inv etc. do not have a a destination register. Assume we - // want a wait on these. - // XXX - What is the right value? - Result.Named.LGKM = 1; - } - } else { - // DS - Result.Named.LGKM = 1; - } - - } else { - Result.Named.LGKM = 0; - } - - return Result; -} - -bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - // Constants are always irrelevant - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return false; - - // Defines are always relevant - if (Op.isDef()) - return true; - - // For exports all registers are relevant. - // TODO: Skip undef/disabled registers. - MachineInstr &MI = *Op.getParent(); - if (TII->isEXP(MI)) - return true; - - // For stores the stored value is also relevant - if (!MI.getDesc().mayStore()) - return false; - - // Check if this operand is the value being stored. - // Special case for DS/FLAT instructions, since the address - // operand comes before the value operand and it may have - // multiple data operands. 
- - if (TII->isDS(MI)) { - MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); - if (Data0 && Op.isIdenticalTo(*Data0)) - return true; - - MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); - return Data1 && Op.isIdenticalTo(*Data1); - } - - if (TII->isFLAT(MI)) { - MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata); - if (Data && Op.isIdenticalTo(*Data)) - return true; - } - - // NOTE: This assumes that the value operand is before the - // address operand, and that there is only one value operand. - for (MachineInstr::mop_iterator I = MI.operands_begin(), - E = MI.operands_end(); I != E; ++I) { - - if (I->isReg() && I->isUse()) - return Op.isIdenticalTo(*I); - } - - return false; -} - -RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, - const MachineOperand &Reg) const { - unsigned Size = TRI->getRegSizeInBits(*RC); - assert(Size >= 32); - - RegInterval Result; - Result.first = TRI->getEncodingValue(Reg.getReg()); - Result.second = Result.first + Size / 32; - - return Result; -} - -void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Increment) { - // Get the hardware counter increments and sum them up - Counters Limit = ZeroCounts; - unsigned Sum = 0; - - if (TII->mayAccessFlatAddressSpace(*I)) - IsFlatOutstanding = true; - - for (unsigned i = 0; i < 3; ++i) { - LastIssued.Array[i] += Increment.Array[i]; - if (Increment.Array[i]) - Limit.Array[i] = LastIssued.Array[i]; - Sum += Increment.Array[i]; - } - - // If we don't increase anything then that's it - if (Sum == 0) { - LastOpcodeType = OTHER; - return; - } - - if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { - // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM - // or SMEM clause, respectively. - // - // The temporary workaround is to break the clauses with S_NOP. - // - // The proper solution would be to allocate registers such that all source - // and destination registers don't overlap, e.g. this is illegal: - // r0 = load r2 - // r2 = load r0 - if (LastOpcodeType == VMEM && Increment.Named.VM) { - // Insert a NOP to break the clause. - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) - .addImm(0); - LastInstWritesM0 = false; - } - - if (TII->isSMRD(*I)) - LastOpcodeType = SMEM; - else if (Increment.Named.VM) - LastOpcodeType = VMEM; - } - - // Remember which export instructions we have seen - if (Increment.Named.EXP) { - ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2; - } - - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &Op = I->getOperand(i); - if (!isOpRelevant(Op)) - continue; - - const TargetRegisterClass *RC = TII->getOpRegClass(*I, i); - RegInterval Interval = getRegInterval(RC, Op); - for (unsigned j = Interval.first; j < Interval.second; ++j) { - - // Remember which registers we define - if (Op.isDef()) - DefinedRegs[j] = Limit; - - // and which one we are using - if (Op.isUse()) - UsedRegs[j] = Limit; - } - } -} - -bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Required) { - // End of program? No need to wait on anything - // A function not returning void needs to wait, because other bytecode will - // be appended after it and we don't know what it will be. 
- if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid) - return false; - - // Figure out if the async instructions execute in order - bool Ordered[3]; - - // VM_CNT is always ordered except when there are flat instructions, which - // can return out of order. - Ordered[0] = !IsFlatOutstanding; - - // EXP_CNT is unordered if we have both EXP & VM-writes - Ordered[1] = ExpInstrTypesSeen == 3; - - // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS - Ordered[2] = false; - - // The values we are going to put into the S_WAITCNT instruction - Counters Counts = HardwareLimits; - - // Do we really need to wait? - bool NeedWait = false; - - for (unsigned i = 0; i < 3; ++i) { - if (Required.Array[i] <= WaitedOn.Array[i]) - continue; - - NeedWait = true; - - if (Ordered[i]) { - unsigned Value = LastIssued.Array[i] - Required.Array[i]; - - // Adjust the value to the real hardware possibilities. - Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]); - } else - Counts.Array[i] = 0; - - // Remember on what we have waited on. - WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; - } - - if (!NeedWait) - return false; - - // Reset EXP_CNT instruction types - if (Counts.Named.EXP == 0) - ExpInstrTypesSeen = 0; - - // Build the wait instruction - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(AMDGPU::encodeWaitcnt(ISA, - Counts.Named.VM, - Counts.Named.EXP, - Counts.Named.LGKM)); - - LastOpcodeType = OTHER; - LastInstWritesM0 = false; - IsFlatOutstanding = false; - return true; -} - -/// \brief helper function for handleOperands -static void increaseCounters(Counters &Dst, const Counters &Src) { - for (unsigned i = 0; i < 3; ++i) - Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); -} - -/// \brief check whether any of the counters is non-zero -static bool countersNonZero(const Counters &Counter) { - for (unsigned i = 0; i < 3; ++i) - if (Counter.Array[i]) - return true; - return false; -} - -void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { - assert(I->getOpcode() == AMDGPU::S_WAITCNT); - - unsigned Imm = I->getOperand(0).getImm(); - Counters Counts, WaitOn; - - Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm); - Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm); - Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm); - - for (unsigned i = 0; i < 3; ++i) { - if (Counts.Array[i] <= LastIssued.Array[i]) - WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; - else - WaitOn.Array[i] = 0; - } - - increaseCounters(DelayedWaitOn, WaitOn); -} - -Counters SIInsertWaits::handleOperands(MachineInstr &MI) { - Counters Result = ZeroCounts; - - // For each register affected by this instruction increase the result - // sequence. - // - // TODO: We could probably just look at explicit operands if we removed VCC / - // EXEC from SMRD dest reg classes. 
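The arithmetic performed by insertWait above can be modeled in isolation: for an ordered counter we may leave (LastIssued - Required) events still outstanding, clamped to the hardware field width, while an unordered counter has to drain to zero. A minimal sketch, with a simplified per-counter signature:

// One counter's field value for the S_WAITCNT built in insertWait above.
#include <algorithm>

unsigned waitValue(unsigned Required, unsigned WaitedOn, unsigned LastIssued,
                   unsigned HwLimit, bool Ordered) {
  if (Required <= WaitedOn)
    return HwLimit;                           // already satisfied: encode "no wait"
  if (!Ordered)
    return 0;                                 // unordered: must drain completely
  return std::min(LastIssued - Required, HwLimit); // allow the rest to stay in flight
}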
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - continue; - - const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); - RegInterval Interval = getRegInterval(RC, Op); - for (unsigned j = Interval.first; j < Interval.second; ++j) { - if (Op.isDef()) { - increaseCounters(Result, UsedRegs[j]); - increaseCounters(Result, DefinedRegs[j]); - } - - if (Op.isUse()) - increaseCounters(Result, DefinedRegs[j]); - } - } - - return Result; -} - -void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) - return; - - // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. - if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) { - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); - LastInstWritesM0 = false; - return; - } - - // Set whether this instruction sets M0 - LastInstWritesM0 = false; - - unsigned NumOperands = I->getNumOperands(); - for (unsigned i = 0; i < NumOperands; i++) { - const MachineOperand &Op = I->getOperand(i); - - if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) - LastInstWritesM0 = true; - } -} - -/// Return true if \p MBB has one successor immediately following, and is its -/// only predecessor -static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) { - if (MBB.succ_size() != 1) - return false; - - const MachineBasicBlock *Succ = *MBB.succ_begin(); - return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ); -} - -// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" -// around other non-memory instructions. -bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { - bool Changes = false; - - ST = &MF.getSubtarget<SISubtarget>(); - TII = ST->getInstrInfo(); - TRI = &TII->getRegisterInfo(); - MRI = &MF.getRegInfo(); - ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - - HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA); - HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA); - HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA); - - WaitedOn = ZeroCounts; - DelayedWaitOn = ZeroCounts; - LastIssued = ZeroCounts; - LastOpcodeType = OTHER; - LastInstWritesM0 = false; - IsFlatOutstanding = false; - ReturnsVoid = MFI->returnsVoid(); - - memset(&UsedRegs, 0, sizeof(UsedRegs)); - memset(&DefinedRegs, 0, sizeof(DefinedRegs)); - - SmallVector<MachineInstr *, 4> RemoveMI; - SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; - - bool HaveScalarStores = false; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - if (!HaveScalarStores && TII->isScalarStore(*I)) - HaveScalarStores = true; - - if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { - // There is a hardware bug on CI/SI where SMRD instruction may corrupt - // vccz bit, so when we detect that an instruction may read from a - // corrupt vccz bit, we need to: - // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to - // complete. - // 2. Restore the correct value of vccz by writing the current value - // of vcc back to vcc. 
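A standalone model of the corruption tracking described above (illustrative names, not the in-tree pass state): an SMRD may leave vccz stale until its lgkmcnt drains, a later write to vcc only restores vccz once nothing is in flight, and a vccz-reading branch while stale needs the wait plus vcc rewrite. The code that follows applies the same rule to MachineInstrs.

#include <cstdio>

struct VcczTracker {
  bool Corrupt = false;          // vccz may disagree with vcc
  bool OutstandingLGKM = false;  // an SMRD result is still pending

  void onSMRD()      { Corrupt = true; OutstandingLGKM = true; }
  void onWaitLgkm0() { OutstandingLGKM = false; }
  void onVCCWrite()  { if (!OutstandingLGKM) Corrupt = false; }
  bool branchNeedsFix() const { return Corrupt; } // emit s_waitcnt + vcc rewrite
};

int main() {
  VcczTracker T;
  T.onSMRD();
  T.onVCCWrite();                                      // too early: SMRD pending
  std::printf("fix needed: %d\n", T.branchNeedsFix()); // 1
  T.onWaitLgkm0();
  T.onVCCWrite();                                      // vccz restored
  std::printf("fix needed: %d\n", T.branchNeedsFix()); // 0
}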
- - if (TII->isSMRD(I->getOpcode())) { - VCCZCorrupt = true; - } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) { - // FIXME: We only care about SMRD instructions here, not LDS or GDS. - // Whenever we store a value in vcc, the correct value of vccz is - // restored. - VCCZCorrupt = false; - } - - // Check if we need to apply the bug work-around - if (VCCZCorrupt && readsVCCZ(*I)) { - DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n'); - - // Wait on everything, not just LGKM. vccz reads usually come from - // terminators, and we always wait on everything at the end of the - // block, so if we only wait on LGKM here, we might end up with - // another s_waitcnt inserted right after this if there are non-LGKM - // instructions still outstanding. - insertWait(MBB, I, LastIssued); - - // Restore the vccz bit. Any time a value is written to vcc, the vcc - // bit is updated, so we can restore the bit by reading the value of - // vcc and then writing it back to the register. - BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), - AMDGPU::VCC) - .addReg(AMDGPU::VCC); - } - } - - // Record pre-existing, explicitly requested waits - if (I->getOpcode() == AMDGPU::S_WAITCNT) { - handleExistingWait(*I); - RemoveMI.push_back(&*I); - continue; - } - - Counters Required; - - // Wait for everything before a barrier. - // - // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, - // but we also want to wait for any other outstanding transfers before - // signalling other hardware blocks - if ((I->getOpcode() == AMDGPU::S_BARRIER && - !ST->hasAutoWaitcntBeforeBarrier()) || - I->getOpcode() == AMDGPU::S_SENDMSG || - I->getOpcode() == AMDGPU::S_SENDMSGHALT) - Required = LastIssued; - else - Required = handleOperands(*I); - - Counters Increment = getHwCounts(*I); - - if (countersNonZero(Required) || countersNonZero(Increment)) - increaseCounters(Required, DelayedWaitOn); - - Changes |= insertWait(MBB, I, Required); - - pushInstruction(MBB, I, Increment); - handleSendMsg(MBB, I); - - if (I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) - EndPgmBlocks.push_back(&MBB); - } - - // Wait for everything at the end of the MBB. If there is only one - // successor, we can defer this until the uses there. - if (!hasTrivialSuccessor(MBB)) - Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); - } - - if (HaveScalarStores) { - // If scalar writes are used, the cache must be flushed or else the next - // wave to reuse the same scratch memory can be clobbered. - // - // Insert s_dcache_wb at wave termination points if there were any scalar - // stores, and only if the cache hasn't already been flushed. This could be - // improved by looking across blocks for flushes in postdominating blocks - // from the stores but an explicitly requested flush is probably very rare. - for (MachineBasicBlock *MBB : EndPgmBlocks) { - bool SeenDCacheWB = false; - - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) { - if (I->getOpcode() == AMDGPU::S_DCACHE_WB) - SeenDCacheWB = true; - else if (TII->isScalarStore(*I)) - SeenDCacheWB = false; - - // FIXME: It would be better to insert this before a waitcnt if any. 
- if ((I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) { - Changes = true; - BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); - } - } - } - } - - for (MachineInstr *I : RemoveMI) - I->eraseFromParent(); - - if (!MFI->isEntryFunction()) { - // Wait for any outstanding memory operations that the input registers may - // depend on. We can't track them and it's better to to the wait after the - // costly call sequence. - - // TODO: Could insert earlier and schedule more liberally with operations - // that only use caller preserved registers. - MachineBasicBlock &EntryBB = MF.front(); - BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); - - Changes = true; - } - - return Changes; -} diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 25917cc06e6a..b73d30940fc3 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -12,16 +12,16 @@ //===----------------------------------------------------------------------===// def isGCN : Predicate<"Subtarget->getGeneration() " - ">= SISubtarget::SOUTHERN_ISLANDS">, + ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; def isSI : Predicate<"Subtarget->getGeneration() " - "== SISubtarget::SOUTHERN_ISLANDS">, + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureSouthernIslands">; class InstSI <dag outs, dag ins, string asm = "", list<dag> pattern = []> : - AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { + AMDGPUInst<outs, ins, asm, pattern>, GCNPredicateControl { let SubtargetPredicate = isGCN; // Low bits - basic encoding information. @@ -118,6 +118,9 @@ class InstSI <dag outs, dag ins, string asm = "", // This bit indicates that this is a packed VOP3P instruction field bit IsPacked = 0; + // This bit indicates that this is a D16 buffer instruction. + field bit D16Buf = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -173,6 +176,8 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{49} = IsPacked; + let TSFlags{50} = D16Buf; + let SchedRW = [Write32Bit]; field bits<1> DisableSIDecoder = 0; @@ -181,6 +186,9 @@ class InstSI <dag outs, dag ins, string asm = "", let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1); let AsmVariantName = AMDGPUAsmVariants.Default; + + // Avoid changing source registers in a way that violates constant bus read limitations. 
+ let hasExtraSrcRegAllocReq = !if(VOP1,1,!if(VOP2,1,!if(VOP3,1,!if(VOPC,1,!if(SDWA,1, !if(VALU,1,0)))))); } class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = ""> @@ -247,6 +255,7 @@ class MIMGe <bits<7> op> : Enc64 { bits<1> tfe; bits<1> lwe; bits<1> slc; + bit d16; bits<8> vaddr; bits<7> srsrc; bits<7> ssamp; @@ -265,6 +274,7 @@ class MIMGe <bits<7> op> : Enc64 { let Inst{47-40} = vdata; let Inst{52-48} = srsrc{6-2}; let Inst{57-53} = ssamp{6-2}; + let Inst{63} = d16; } class EXPe : Enc64 { @@ -309,6 +319,7 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; + let VALU = 1; } class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> : @@ -323,15 +334,3 @@ class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> : } } // End Uses = [EXEC] - -class MIMG <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> { - - let VM_CNT = 1; - let EXP_CNT = 1; - let MIMG = 1; - let Uses = [EXEC]; - - let UseNamedOperandTable = 1; - let hasSideEffects = 0; // XXX ???? -} diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 61967605432e..6c85c92454c3 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -8,17 +8,19 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief SI Implementation of TargetInstrInfo. +/// SI Implementation of TargetInstrInfo. // //===----------------------------------------------------------------------===// #include "SIInstrInfo.h" #include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "GCNHazardRecognizer.h" #include "SIDefines.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" @@ -37,7 +39,6 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -53,6 +54,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include <cassert> @@ -62,6 +64,19 @@ using namespace llvm; +#define GET_INSTRINFO_CTOR_DTOR +#include "AMDGPUGenInstrInfo.inc" + +namespace llvm { +namespace AMDGPU { +#define GET_D16ImageDimIntrinsics_IMPL +#define GET_ImageDimIntrinsicTable_IMPL +#define GET_RsrcIntrinsics_IMPL +#include "AMDGPUGenSearchableTables.inc" +} +} + + // Must be at least 4 to be able to branch over minimum unconditional branch // code. This is only for making it possible to write reasonably small tests for // long branches. 
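The MIMGe change above (in SIInstrFormats.td) adds the d16 flag at Inst{63}. Packing just the fields visible in that hunk by hand would look roughly like the sketch below; the low dword and the remaining high-dword fields are omitted since they are outside the shown context.

#include <cstdint>

uint64_t packMIMGHigh(uint8_t vdata, uint8_t srsrc, uint8_t ssamp, bool d16) {
  uint64_t Inst = 0;
  Inst |= uint64_t(vdata) << 40;        // Inst{47-40} = vdata
  Inst |= uint64_t(srsrc >> 2) << 48;   // Inst{52-48} = srsrc{6-2}
  Inst |= uint64_t(ssamp >> 2) << 53;   // Inst{57-53} = ssamp{6-2}
  Inst |= uint64_t(d16) << 63;          // Inst{63}    = d16 (new bit)
  return Inst;
}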
@@ -69,8 +84,9 @@ static cl::opt<unsigned> BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)")); -SIInstrInfo::SIInstrInfo(const SISubtarget &ST) - : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {} +SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) + : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + RI(ST), ST(ST) {} //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -89,7 +105,7 @@ static SDValue findChainOperand(SDNode *Load) { return LastOp; } -/// \brief Returns true if both nodes have the same value for the given +/// Returns true if both nodes have the same value for the given /// operand \p Op, or if both nodes do not have this operand. static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { unsigned Opc0 = N0->getMachineOpcode(); @@ -437,6 +453,28 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; } +// FIXME: This behaves strangely. If, for example, you have 32 load + stores, +// the first 16 loads will be interleaved with the stores, and the next 16 will +// be clustered as expected. It should really split into 2 16 store batches. +// +// Loads are clustered until this returns false, rather than trying to schedule +// groups of stores. This also means we have to deal with saying different +// address space loads should be clustered, and ones which might cause bank +// conflicts. +// +// This might be deprecated so it might not be worth that much effort to fix. +bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, + int64_t Offset0, int64_t Offset1, + unsigned NumLoads) const { + assert(Offset1 > Offset0 && + "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 64 + // bytes, then schedule together. + + // A cacheline is 64 bytes (for global memory). + return (NumLoads <= 16 && (Offset1 - Offset0) < 64); +} + static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, @@ -827,10 +865,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFrameInfo &FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - assert(SrcReg != MFI->getStackPtrOffsetReg() && - SrcReg != MFI->getFrameOffsetReg() && - SrcReg != MFI->getScratchWaveOffsetReg()); - unsigned Size = FrameInfo.getObjectSize(FrameIndex); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); MachinePointerInfo PtrInfo @@ -864,7 +898,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // needing them, and need to ensure that the reserved registers are // correctly handled. - FrameInfo.setStackID(FrameIndex, 1); + FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. 
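The shouldScheduleLoadsNear hook added above in this file encodes a simple heuristic: keep clustering while fewer than 16 loads have been grouped and the offsets stay within one 64-byte cacheline. Reduced to a self-contained predicate (illustrative name, offsets assumed byte-granular):

bool clusterLoads(long offset0, long offset1, unsigned numLoads) {
  // offset1 > offset0 is asserted by the caller in the real hook.
  return numLoads <= 16 && (offset1 - offset0) < 64; // one 64-byte cacheline
}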
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); @@ -960,7 +994,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } - FrameInfo.setStackID(FrameIndex, 1); + FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) @@ -1001,7 +1035,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( unsigned FrameOffset, unsigned Size) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); @@ -1137,7 +1171,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); switch (MI.getOpcode()) { - default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + default: return TargetInstrInfo::expandPostRAPseudo(MI); case AMDGPU::S_MOV_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. @@ -1269,6 +1303,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_MOV_B64)); break; } + case TargetOpcode::BUNDLE: { + if (!MI.mayLoad()) + return false; + + // If it is a load it must be a memory clause + for (MachineBasicBlock::instr_iterator I = MI.getIterator(); + I->isBundledWithSucc(); ++I) { + I->unbundleFromSucc(); + for (MachineOperand &MO : I->operands()) + if (MO.isReg()) + MO.setIsInternalRead(false); + } + + MI.eraseFromParent(); + break; + } } return true; } @@ -1887,16 +1937,16 @@ unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( switch(Kind) { case PseudoSourceValue::Stack: case PseudoSourceValue::FixedStack: - return AMDGPUASI.PRIVATE_ADDRESS; + return ST.getAMDGPUAS().PRIVATE_ADDRESS; case PseudoSourceValue::ConstantPool: case PseudoSourceValue::GOT: case PseudoSourceValue::JumpTable: case PseudoSourceValue::GlobalValueCallEntry: case PseudoSourceValue::ExternalSymbolCallEntry: case PseudoSourceValue::TargetCustom: - return AMDGPUASI.CONSTANT_ADDRESS; + return ST.getAMDGPUAS().CONSTANT_ADDRESS; } - return AMDGPUASI.FLAT_ADDRESS; + return ST.getAMDGPUAS().FLAT_ADDRESS; } static void removeModOperands(MachineInstr &MI) { @@ -2165,20 +2215,24 @@ static int64_t getFoldableImm(const MachineOperand* MO) { MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const { + unsigned Opc = MI.getOpcode(); bool IsF16 = false; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64; - switch (MI.getOpcode()) { + switch (Opc) { default: return nullptr; case AMDGPU::V_MAC_F16_e64: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_FMAC_F32_e64: break; case AMDGPU::V_MAC_F16_e32: IsF16 = true; LLVM_FALLTHROUGH; - case AMDGPU::V_MAC_F32_e32: { + case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_FMAC_F32_e32: { int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); const MachineOperand *Src0 = &MI.getOperand(Src0Idx); @@ -2203,7 +2257,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Clamp = getNamedOperand(MI, 
AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - if (!Src0Mods && !Src1Mods && !Clamp && !Omod && + if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod && // If we have an SGPR input, we will violate the constant bus restriction. (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { if (auto Imm = getFoldableImm(Src2)) { @@ -2234,8 +2288,10 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, } } - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) + assert((!IsFMA || !IsF16) && "fmac only expected with f32"); + unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 : + (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) .addImm(Src0Mods ? Src0Mods->getImm() : 0) .add(*Src0) @@ -2339,6 +2395,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, } case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + if (isUInt<16>(Imm)) { + int16_t Trunc = static_cast<int16_t>(Imm); + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); + } + if (!(Imm & 0xffff)) { + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm()); + } uint32_t Trunc = static_cast<uint32_t>(Imm); return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); } @@ -2711,14 +2776,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - // Verify VOP* - if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) { + // Verify VOP*. Ignore multiple sgpr operands on writelane. + if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 + && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; + unsigned LiteralCount = 0; if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) ++ConstantBusCount; @@ -2738,6 +2805,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, SGPRUsed = MO.getReg(); } else { ++ConstantBusCount; + ++LiteralCount; } } } @@ -2745,6 +2813,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "VOP* instruction uses the constant bus more than once"; return false; } + + if (isVOP3(MI) && LiteralCount) { + ErrInfo = "VOP3 instruction uses literal"; + return false; + } } // Verify misc. restrictions on specific instructions. 
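The constant-bus verification above enforces two rules: a VOP* instruction may make at most one constant-bus read (one distinct SGPR, or one literal), and a VOP3 encoding may not use a literal at all. A minimal standalone model with simplified operand types:

#include <optional>
#include <vector>

struct SrcOp { bool isSGPR; bool isLiteral; unsigned reg; };

bool violatesConstantBus(const std::vector<SrcOp> &Srcs, bool IsVOP3) {
  unsigned busReads = 0, literals = 0;
  std::optional<unsigned> sgprUsed;             // repeated SGPR counts once
  for (const SrcOp &S : Srcs) {
    if (S.isSGPR) {
      if (!sgprUsed || *sgprUsed != S.reg) { ++busReads; sgprUsed = S.reg; }
    } else if (S.isLiteral) {
      ++busReads;
      ++literals;
    }
  }
  return busReads > 1 || (IsVOP3 && literals != 0);
}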
@@ -2842,7 +2915,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) { + if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) { const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); if (Offset->getImm() != 0) { ErrInfo = "subtarget does not support offsets in flat instructions"; @@ -2850,6 +2923,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); + if (DppCt) { + using namespace AMDGPU::DPP; + + unsigned DC = DppCt->getImm(); + if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || + DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || + (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || + (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || + (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || + (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) { + ErrInfo = "Invalid dpp_ctrl value"; + return false; + } + } + return true; } @@ -3147,6 +3236,29 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, legalizeOpWithMove(MI, Src0Idx); } + // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for + // both the value to write (src0) and lane select (src1). Fix up non-SGPR + // src0/src1 with V_READFIRSTLANE. + if (Opc == AMDGPU::V_WRITELANE_B32) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + const DebugLoc &DL = MI.getDebugLoc(); + if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src0); + Src0.ChangeToRegister(Reg, false); + } + if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src1); + Src1.ChangeToRegister(Reg, false); + } + return; + } + // VOP2 src0 instructions support all operand types, so we don't need to check // their legality. If src1 is already legal, we don't need to do anything. if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) @@ -3261,6 +3373,13 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, unsigned DstReg = MRI.createVirtualRegister(SRC); unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; + if (SubRegs == 1) { + BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), + get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(SrcReg); + return DstReg; + } + SmallVector<unsigned, 8> SRegs; for (unsigned i = 0; i < SubRegs; ++i) { unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); @@ -3438,6 +3557,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { return; } + // Legalize SI_INIT_M0 + if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { + MachineOperand &Src = MI.getOperand(0); + if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg()))) + Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); + return; + } + // Legalize MIMG and MUBUF/MTBUF for shaders. 
// // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via @@ -3539,8 +3666,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { } else { // This instructions is the _OFFSET variant, so we need to convert it to // ADDR64. - assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration() - < SISubtarget::VOLCANIC_ISLANDS && + assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration() + < AMDGPUSubtarget::VOLCANIC_ISLANDS && "FIXME: Need to emit flat atomics here"); MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); @@ -3676,37 +3803,37 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { continue; case AMDGPU::S_LSHL_B32: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I32: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B32: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHL_B64: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I64: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B64: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B64; swapOperands(Inst); } @@ -3756,39 +3883,49 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // FIXME: This isn't safe because the addressing mode doesn't work // correctly if vaddr is negative. // - // FIXME: Handle v_add_u32 and VOP3 form. Also don't rely on immediate - // being in src0. - // // FIXME: Should probably be done somewhere else, maybe SIFoldOperands. // // See if we can extract an immediate offset by recognizing one of these: // V_ADD_I32_e32 dst, imm, src1 // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1 // V_ADD will be removed by "Remove dead machine instructions". - if (Add && Add->getOpcode() == AMDGPU::V_ADD_I32_e32) { - const MachineOperand *Src = - getNamedOperand(*Add, AMDGPU::OpName::src0); - - if (Src->isReg()) { - auto Mov = MRI.getUniqueVRegDef(Src->getReg()); - if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) - Src = &Mov->getOperand(1); - } - - if (Src) { - if (Src->isImm()) - Offset = Src->getImm(); - else if (Src->isCImm()) - Offset = Src->getCImm()->getZExtValue(); - } + if (Add && + (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 || + Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) { + static const unsigned SrcNames[2] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + }; + + // Find a literal offset in one of source operands. 
+ for (int i = 0; i < 2; i++) { + const MachineOperand *Src = + getNamedOperand(*Add, SrcNames[i]); + + if (Src->isReg()) { + auto Mov = MRI.getUniqueVRegDef(Src->getReg()); + if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) + Src = &Mov->getOperand(1); + } + + if (Src) { + if (Src->isImm()) + Offset = Src->getImm(); + else if (Src->isCImm()) + Offset = Src->getCImm()->getZExtValue(); + } + + if (Offset && isLegalMUBUFImmOffset(Offset)) { + VAddr = getNamedOperand(*Add, SrcNames[!i]); + break; + } - if (Offset && isLegalMUBUFImmOffset(Offset)) - VAddr = getNamedOperand(*Add, AMDGPU::OpName::src1); - else Offset = 0; + } } - BuildMI(*MBB, Inst, Inst.getDebugLoc(), + MachineInstr *NewInstr = + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst) .add(*VAddr) // vaddr .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc @@ -3797,12 +3934,17 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm()) .addImm(0) // slc .addImm(0) // tfe - .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()); + .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()) + .getInstr(); MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(), VDst); addUsersToMoveToVALUWorklist(VDst, MRI, Worklist); Inst.eraseFromParent(); + + // Legalize all operands other than the offset. Notably, convert the srsrc + // into SGPRs using v_readfirstlane if needed. + legalizeOperands(*NewInstr); continue; } } @@ -3884,6 +4026,13 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); MRI.clearKillFlags(Inst.getOperand(1).getReg()); Inst.getOperand(0).setReg(DstReg); + + // Make sure we don't leave around a dead VGPR->SGPR copy. Normally + // these are deleted later, but at -O0 it would leave a suspicious + // looking illegal copy of an undef register. + for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) + Inst.RemoveOperand(I); + Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); continue; } @@ -3975,17 +4124,23 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); - unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) - .add(Src0) - .add(Src1); + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (ST.hasDLInsts()) { + BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) + .add(Src0) + .add(Src1); + } else { + unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) + .add(Src0) + .add(Src1); - unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not) - .addReg(Xor); + BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest) + .addReg(Xor); + } - MRI.replaceRegWith(Dest.getReg(), Not); - addUsersToMoveToVALUWorklist(Not, MRI, Worklist); + MRI.replaceRegWith(Dest.getReg(), NewDest); + addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } void SIInstrInfo::splitScalar64BitUnaryOp( @@ -4478,12 +4633,12 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; if (ST.isAmdHsaOS()) { // Set ATC = 1. GFX9 doesn't have this bit. 
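lowerScalarXnor above now emits a single V_XNOR_B32 when the subtarget has DL instructions and otherwise falls back to V_XOR_B32 followed by V_NOT_B32; both paths produce the same value. A scalar model of the two lowerings:

#include <cstdint>
#include <cstdio>

uint32_t xnorSingle(uint32_t a, uint32_t b) { return ~(a ^ b); }  // V_XNOR_B32
uint32_t xnorExpanded(uint32_t a, uint32_t b) {
  uint32_t x = a ^ b;                                             // V_XOR_B32
  return ~x;                                                      // V_NOT_B32
}

int main() {
  uint32_t a = 0xF0F0F0F0u, b = 0xFF00FF00u;
  std::printf("%08x %08x\n", xnorSingle(a, b), xnorExpanded(a, b)); // identical
}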
- if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) RsrcDataFormat |= (1ULL << 56); // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. // BTW, it disables TC L2 and therefore decreases performance. - if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) RsrcDataFormat |= (2ULL << 59); } @@ -4496,7 +4651,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { 0xffffffff; // Size; // GFX9 doesn't have ELEMENT_SIZE. - if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; } @@ -4506,7 +4661,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; return Rsrc23; @@ -4531,7 +4686,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, return AMDGPU::NoRegister; assert(!MI.memoperands_empty() && - (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS); + (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS); FrameIndex = Addr->getIndex(); return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); @@ -4598,12 +4753,12 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (DescSize != 0 && DescSize != 4) return DescSize; + if (isFixedSize(MI)) + return DescSize; + // 4-byte instructions may have a 32-bit literal encoded after them. Check // operands that coud ever be literals. if (isVALU(MI) || isSALU(MI)) { - if (isFixedSize(MI)) - return DescSize; - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return 4; // No operands. @@ -4650,7 +4805,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { return true; for (const MachineMemOperand *MMO : MI.memoperands()) { - if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS) + if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS) return true; } return false; @@ -4817,3 +4972,70 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO"); } } + +bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { + if (!isSMRD(MI)) + return false; + + // Check that it is using a buffer resource. + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); + if (Idx == -1) // e.g. 
s_memtime + return false; + + const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; + return RCID == AMDGPU::SReg_128RegClassID; +} + +// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td +enum SIEncodingFamily { + SI = 0, + VI = 1, + SDWA = 2, + SDWA9 = 3, + GFX80 = 4, + GFX9 = 5 +}; + +static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { + switch (ST.getGeneration()) { + default: + break; + case AMDGPUSubtarget::SOUTHERN_ISLANDS: + case AMDGPUSubtarget::SEA_ISLANDS: + return SIEncodingFamily::SI; + case AMDGPUSubtarget::VOLCANIC_ISLANDS: + case AMDGPUSubtarget::GFX9: + return SIEncodingFamily::VI; + } + llvm_unreachable("Unknown subtarget generation!"); +} + +int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { + SIEncodingFamily Gen = subtargetEncodingFamily(ST); + + if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && + ST.getGeneration() >= AMDGPUSubtarget::GFX9) + Gen = SIEncodingFamily::GFX9; + + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) + Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 + : SIEncodingFamily::SDWA; + // Adjust the encoding family to GFX80 for D16 buffer instructions when the + // subtarget has UnpackedD16VMem feature. + // TODO: remove this when we discard GFX80 encoding. + if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) + Gen = SIEncodingFamily::GFX80; + + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); + + // -1 means that Opcode is already a native instruction. + if (MCOp == -1) + return Opcode; + + // (uint16_t)-1 means that Opcode is a pseudo instruction that has + // no encoding in the given subtarget generation. + if (MCOp == (uint16_t)-1) + return -1; + + return MCOp; +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 24ee843e6ade..0a735257d34e 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface definition for SIInstrInfo. +/// Interface definition for SIInstrInfo. // //===----------------------------------------------------------------------===// @@ -31,20 +31,23 @@ #include <cassert> #include <cstdint> +#define GET_INSTRINFO_HEADER +#include "AMDGPUGenInstrInfo.inc" + namespace llvm { class APInt; class MachineRegisterInfo; class RegScavenger; -class SISubtarget; +class GCNSubtarget; class TargetRegisterClass; -class SIInstrInfo final : public AMDGPUInstrInfo { +class SIInstrInfo final : public AMDGPUGenInstrInfo { private: const SIRegisterInfo RI; - const SISubtarget &ST; + const GCNSubtarget &ST; - // The the inverse predicate should have the negative value. + // The inverse predicate should have the negative value. 
enum BranchPredicate { INVALID_BR = 0, SCC_TRUE = 1, @@ -144,7 +147,7 @@ public: MO_REL32_HI = 5 }; - explicit SIInstrInfo(const SISubtarget &ST); + explicit SIInstrInfo(const GCNSubtarget &ST); const SIRegisterInfo &getRegisterInfo() const { return RI; @@ -163,7 +166,10 @@ public: bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1, MachineInstr &SecondLdSt, unsigned BaseReg2, - unsigned NumLoads) const final; + unsigned NumLoads) const override; + + bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, + int64_t Offset1, unsigned NumLoads) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, @@ -203,7 +209,7 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; - // \brief Returns an opcode that can be used to move a value to a \p DstRC + // Returns an opcode that can be used to move a value to a \p DstRC // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; @@ -419,18 +425,7 @@ public: return get(Opcode).TSFlags & SIInstrFlags::SMRD; } - bool isBufferSMRD(const MachineInstr &MI) const { - if (!isSMRD(MI)) - return false; - - // Check that it is using a buffer resource. - int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); - if (Idx == -1) // e.g. s_memtime - return false; - - const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; - return RCID == AMDGPU::SReg_128RegClassID; - } + bool isBufferSMRD(const MachineInstr &MI) const; static bool isDS(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::DS; @@ -674,16 +669,16 @@ public: bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const; - /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. + /// Return true if this 64-bit VALU instruction has a 32-bit encoding. /// This function will return false if you pass it a 32-bit instruction. bool hasVALU32BitEncoding(unsigned Opcode) const; - /// \brief Returns true if this operand uses the constant bus. + /// Returns true if this operand uses the constant bus. bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const; - /// \brief Return true if this instruction has any modifiers. + /// Return true if this instruction has any modifiers. /// e.g. src[012]_mod, omod, clamp. bool hasModifiers(unsigned Opcode) const; @@ -696,7 +691,7 @@ public: unsigned getVALUOp(const MachineInstr &MI) const; - /// \brief Return the correct register class for \p OpNo. For target-specific + /// Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined /// in tablegen. For generic instructions, like REG_SEQUENCE it will return /// the register class of its machine operand. @@ -704,7 +699,7 @@ public: const TargetRegisterClass *getOpRegClass(const MachineInstr &MI, unsigned OpNo) const; - /// \brief Return the size in bytes of the operand OpNo on the given + /// Return the size in bytes of the operand OpNo on the given // instruction opcode. 
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo]; @@ -718,7 +713,7 @@ public: return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8; } - /// \brief This form should usually be preferred since it handles operands + /// This form should usually be preferred since it handles operands /// with unknown register classes. unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; @@ -728,7 +723,7 @@ public: /// to read a VGPR. bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const; - /// \brief Legalize the \p OpIndex operand of this instruction by inserting + /// Legalize the \p OpIndex operand of this instruction by inserting /// a MOV. For example: /// ADD_I32_e32 VGPR0, 15 /// to @@ -739,29 +734,29 @@ public: /// instead of MOV. void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const; - /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand + /// Check if \p MO is a legal operand if it was the \p OpIdx Operand /// for \p MI. bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; - /// \brief Check if \p MO would be a valid operand for the given operand + /// Check if \p MO would be a valid operand for the given operand /// definition \p OpInfo. Note this does not attempt to validate constant bus /// restrictions (e.g. literal constant usage). bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const; - /// \brief Check if \p MO (a register operand) is a legal register for the + /// Check if \p MO (a register operand) is a legal register for the /// given operand description. bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const; - /// \brief Legalize operands in \p MI by either commuting it or inserting a + /// Legalize operands in \p MI by either commuting it or inserting a /// copy of src1. void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const; - /// \brief Fix operands in \p MI to satisfy constant bus requirements. + /// Fix operands in \p MI to satisfy constant bus requirements. void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const; /// Copy a value from a VGPR (\p SrcReg) to SGPR. This function can only @@ -779,11 +774,11 @@ public: MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const; - /// \brief Legalize all operands in this instruction. This function may + /// Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr &MI) const; - /// \brief Replace this instruction's opcode with the equivalent VALU + /// Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the /// VALU if necessary. void moveToVALU(MachineInstr &MI) const; @@ -795,11 +790,11 @@ public: MachineBasicBlock::iterator MI) const override; void insertReturn(MachineBasicBlock &MBB) const; - /// \brief Return the number of wait states that result from executing this + /// Return the number of wait states that result from executing this /// instruction. unsigned getNumWaitStates(const MachineInstr &MI) const; - /// \brief Returns the operand named \p Op. If \p MI does not have an + /// Returns the operand named \p Op. 
If \p MI does not have an /// operand named \c Op, this function returns nullptr. LLVM_READONLY MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; @@ -822,7 +817,7 @@ public: bool isLowLatencyInstruction(const MachineInstr &MI) const; bool isHighLatencyInstruction(const MachineInstr &MI) const; - /// \brief Return the descriptor of the target-specific machine instruction + /// Return the descriptor of the target-specific machine instruction /// that corresponds to the specified pseudo or native opcode. const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { return get(pseudoToMCOpcode(Opcode)); @@ -867,7 +862,7 @@ public: bool isBasicBlockPrologue(const MachineInstr &MI) const override; - /// \brief Return a partially built integer add instruction without carry. + /// Return a partially built integer add instruction without carry. /// Caller must add source operands. /// For pre-GFX9 it will generate unused carry destination operand. /// TODO: After GFX9 it should return a no-carry operation. @@ -882,6 +877,12 @@ public: static bool isLegalMUBUFImmOffset(unsigned Imm) { return isUInt<12>(Imm); } + + /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. + /// Return -1 if the target-specific opcode for the pseudo instruction does + /// not exist. If Opcode is not a pseudo instruction, this is identity. + int pseudoToMCOpcode(int Opcode) const; + }; namespace AMDGPU { @@ -908,6 +909,9 @@ namespace AMDGPU { int getAddr64Inst(uint16_t Opcode); LLVM_READONLY + int getMUBUFNoLdsInst(uint16_t Opcode); + + LLVM_READONLY int getAtomicRetOp(uint16_t Opcode); LLVM_READONLY diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index fc2d35d873aa..8fa37aa83dae 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -7,16 +7,21 @@ // //===----------------------------------------------------------------------===// def isCI : Predicate<"Subtarget->getGeneration() " - ">= SISubtarget::SEA_ISLANDS">; + ">= AMDGPUSubtarget::SEA_ISLANDS">; def isCIOnly : Predicate<"Subtarget->getGeneration() ==" - "SISubtarget::SEA_ISLANDS">, + "AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate <"FeatureSeaIslands">; def isVIOnly : Predicate<"Subtarget->getGeneration() ==" - "SISubtarget::VOLCANIC_ISLANDS">, + "AMDGPUSubtarget::VOLCANIC_ISLANDS">, AssemblerPredicate <"FeatureVolcanicIslands">; def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; +class GCNPredicateControl : PredicateControl { + Predicate SIAssemblerPredicate = isSICI; + Predicate VIAssemblerPredicate = isVI; +} + // Execpt for the NONE field, this must be kept in sync with the // SIEncodingFamily enum in AMDGPUInstrInfo.cpp def SIEncodingFamily { @@ -25,13 +30,16 @@ def SIEncodingFamily { int VI = 1; int SDWA = 2; int SDWA9 = 3; - int GFX9 = 4; + int GFX80 = 4; + int GFX9 = 5; } //===----------------------------------------------------------------------===// // SI DAG Nodes //===----------------------------------------------------------------------===// +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; + def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, [SDNPMayLoad, SDNPMemOperand] @@ -45,22 +53,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", - SDTypeProfile<1, 9, - [ // vdata 
- SDTCisVT<1, v4i32>, // rsrc - SDTCisVT<2, i32>, // vindex(VGPR) - SDTCisVT<3, i32>, // voffset(VGPR) - SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // dfmt(imm) - SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // glc(imm) - SDTCisVT<9, i32> // slc(imm) - ]>, - [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +def SDTAtomic2_f32 : SDTypeProfile<1, 2, [ + SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1> +]>; + +def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] +>; + +def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; +def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] +>; + +def SDTbuffer_load : SDTypeProfile<1, 9, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) + ]>; + +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16", + SDTbuffer_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; + def SDTtbuffer_store : SDTypeProfile<0, 10, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -79,6 +106,9 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16", + SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata @@ -92,6 +122,9 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16", + SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SDTBufferStore : SDTypeProfile<0, 6, [ // vdata @@ -102,9 +135,13 @@ def SDTBufferStore : SDTypeProfile<0, 6, SDTCisVT<5, i1>]>; // slc def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, - [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>; -def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore, - [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>; + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; class SDBufferAtomic<string opcode> : SDNode <opcode, SDTypeProfile<1, 5, @@ -140,21 +177,41 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; -class SDSample<string opcode> : SDNode <opcode, - SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>, - SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> ->; - -def 
SIsample : SDSample<"AMDGPUISD::SAMPLE">; -def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; -def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; -def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; - def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]> >; //===----------------------------------------------------------------------===// +// ValueType helpers +//===----------------------------------------------------------------------===// + +// Returns 1 if the source arguments have modifiers, 0 if they do not. +// XXX - do f16 instructions? +class isFloatType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, f16.Value), 1, + !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, + !if(!eq(SrcVT.Value, v2f16.Value), 1, + 0)))); +} + +class isIntType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, i16.Value), 1, + !if(!eq(SrcVT.Value, i32.Value), 1, + !if(!eq(SrcVT.Value, i64.Value), 1, + 0))); +} + +class isPackedType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, v2i16.Value), 1, + !if(!eq(SrcVT.Value, v2f16.Value), 1, 0) + ); +} + +//===----------------------------------------------------------------------===// // PatFrags for global memory operations //===----------------------------------------------------------------------===// @@ -163,6 +220,9 @@ defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>; def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>; def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>; +def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>; +def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>; +def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>; //===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. 
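The isFloatType / isIntType / isPackedType helpers added above are nested TableGen !if chains over ValueType; the same classification rendered in C++ for readability (the enum is illustrative):

#include <initializer_list>

enum class VT { f16, f32, f64, v2f16, i16, i32, i64, v2i16 };

bool isFloatType(VT T) {
  for (VT F : {VT::f16, VT::f32, VT::f64, VT::v2f16})
    if (T == F) return true;
  return false;
}
bool isIntType(VT T) {
  for (VT I : {VT::i16, VT::i32, VT::i64})
    if (T == I) return true;
  return false;
}
bool isPackedType(VT T) { return T == VT::v2i16 || T == VT::v2f16; }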
@@ -178,6 +238,10 @@ def AMDGPUld_glue : SDNode <"ISD::LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; +def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{ return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED; }]>; @@ -186,6 +250,18 @@ def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{ return cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; }]>; +def atomic_load_32_glue : PatFrag<(ops node:$ptr), + (AMDGPUatomic_ld_glue node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i32; +} + +def atomic_load_64_glue : PatFrag<(ops node:$ptr), + (AMDGPUatomic_ld_glue node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i64; +} + def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{ return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD; }]>; @@ -219,6 +295,9 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{ def load_glue_align8 : Aligned8Bytes < (ops node:$ptr), (load_glue node:$ptr) >; +def load_glue_align16 : Aligned16Bytes < + (ops node:$ptr), (load_glue node:$ptr) +>; def load_local_m0 : LoadFrag<load_glue>, LocalAddress; @@ -227,12 +306,23 @@ def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress; def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress; def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress; def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress; +def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress; +def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress; +def atomic_load_64_local_m0 : LoadFrag<atomic_load_64_glue>, LocalAddress; def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] >; +def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] +>; + +def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val), + (AMDGPUatomic_st_glue node:$ptr, node:$val)> { +} + def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr), (AMDGPUst_glue node:$val, node:$ptr), [{ return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED; @@ -262,11 +352,17 @@ def store_glue_align8 : Aligned8Bytes < (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) >; +def store_glue_align16 : Aligned16Bytes < + (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) +>; + def store_local_m0 : StoreFrag<store_glue>, LocalAddress; def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress; def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress; +def atomic_store_local_m0 : StoreFrag<AMDGPUatomic_st_glue>, LocalAddress; def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress; +def store_align16_local_m0 : StoreFrag<store_glue_align16>, LocalAddress; def si_setcc_uniform : PatFrag < (ops node:$lhs, node:$rhs, node:$cond), @@ -297,10 +393,11 @@ def lshl_rev : PatFrag < (shl $src0, $src1) >; -multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> { +multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, + SDTypeProfile tc = SDTAtomic2> { def _glue : SDNode < - !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2, + !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc, [SDNPHasChain, SDNPMayStore, 
SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; @@ -319,6 +416,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; +defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>; +defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>; +defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>; def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] @@ -368,6 +468,12 @@ return CurDAG->getTargetConstant( N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); }]>; +class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{ + uint64_t Imm = N->getZExtValue(); + unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1; + return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1); +}]>; + def SIMM16bit : PatLeaf <(imm), [{return isInt<16>(N->getSExtValue());}] >; @@ -381,7 +487,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{ }]>; class VGPRImm <dag frag> : PatLeaf<frag, [{ - if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { return false; } const SIRegisterInfo *SIRI = @@ -552,19 +658,18 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } -class SDWASrc : RegisterOperand<VS_32> { +class SDWASrc<ValueType vt> : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_SDWA_SRC"; + string Type = !if(isFloatType<vt>.ret, "FP", "INT"); + let OperandType = "OPERAND_REG_INLINE_C_"#Type#vt.Size; + let DecoderMethod = "decodeSDWASrc"#vt.Size; let EncoderMethod = "getSDWASrcEncoding"; } -def SDWASrc32 : SDWASrc { - let DecoderMethod = "decodeSDWASrc32"; -} - -def SDWASrc16 : SDWASrc { - let DecoderMethod = "decodeSDWASrc16"; -} +def SDWASrc_i32 : SDWASrc<i32>; +def SDWASrc_i16 : SDWASrc<i16>; +def SDWASrc_f32 : SDWASrc<f32>; +def SDWASrc_f16 : SDWASrc<f16>; def SDWAVopcDst : VOPDstOperand<SReg_64> { let OperandNamespace = "AMDGPU"; @@ -637,19 +742,20 @@ def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>; def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>; def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; -def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; -def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; -def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; -def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>; -def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; -def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; +def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; +def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; +def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; +def R128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; +def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>; +def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; -def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; +def DMask : 
NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; @@ -747,16 +853,23 @@ class OpSelModsMatchClass : AsmOperandClass { def IntOpSelModsMatchClass : OpSelModsMatchClass; def IntOpSelMods : InputMods<IntOpSelModsMatchClass>; -def FPRegSDWAInputModsMatchClass : AsmOperandClass { - let Name = "SDWARegWithFPInputMods"; - let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isSDWARegKind"; +class FPSDWAInputModsMatchClass <int opSize> : AsmOperandClass { + let Name = "SDWAWithFP"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImmWithFPInputMods"; + let PredicateMethod = "isSDWAFP"#opSize#"Operand"; } -def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> { +def FP16SDWAInputModsMatchClass : FPSDWAInputModsMatchClass<16>; +def FP32SDWAInputModsMatchClass : FPSDWAInputModsMatchClass<32>; + +class FPSDWAInputMods <FPSDWAInputModsMatchClass matchClass> : + InputMods <matchClass> { let PrintMethod = "printOperandAndFPInputMods"; } +def FP16SDWAInputMods : FPSDWAInputMods<FP16SDWAInputModsMatchClass>; +def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>; + def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; @@ -767,17 +880,23 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { let PrintMethod = "printOperandAndFPInputMods"; } - -def IntRegSDWAInputModsMatchClass : AsmOperandClass { - let Name = "SDWARegWithIntInputMods"; - let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isSDWARegKind"; +class IntSDWAInputModsMatchClass <int opSize> : AsmOperandClass { + let Name = "SDWAWithInt"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImmWithIntInputMods"; + let PredicateMethod = "isSDWAInt"#opSize#"Operand"; } -def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> { +def Int16SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<16>; +def Int32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32>; + +class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> : + InputMods <matchClass> { let PrintMethod = "printOperandAndIntInputMods"; } +def Int16SDWAInputMods : IntSDWAInputMods<Int16SDWAInputModsMatchClass>; +def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>; + def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; @@ -1023,7 +1142,12 @@ class getVregSrcForVT<ValueType VT> { } class getSDWASrcForVT <ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32); + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + 0)); + RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); + RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); + RegisterOperand ret = !if(isFP, retFlt, retInt); } // Returns the register class to use for sources of VOP3 instructions for the @@ -1064,32 +1188,6 @@ class getVOP3SrcForVT<ValueType VT> { ); } -// Returns 1 if the source arguments have modifiers, 0 if they do not. -// XXX - do f16 instructions? 
-class isFloatType<ValueType SrcVT> { - bit ret = - !if(!eq(SrcVT.Value, f16.Value), 1, - !if(!eq(SrcVT.Value, f32.Value), 1, - !if(!eq(SrcVT.Value, f64.Value), 1, - !if(!eq(SrcVT.Value, v2f16.Value), 1, - 0)))); -} - -class isIntType<ValueType SrcVT> { - bit ret = - !if(!eq(SrcVT.Value, i16.Value), 1, - !if(!eq(SrcVT.Value, i32.Value), 1, - !if(!eq(SrcVT.Value, i64.Value), 1, - 0))); -} - -class isPackedType<ValueType SrcVT> { - bit ret = - !if(!eq(SrcVT.Value, v2i16.Value), 1, - !if(!eq(SrcVT.Value, v2f16.Value), 1, 0) - ); -} - // Float or packed int class isModifierType<ValueType SrcVT> { bit ret = @@ -1134,11 +1232,10 @@ class getSrcModExt <ValueType VT> { // Return type of input modifiers operand specified input operand for SDWA class getSrcModSDWA <ValueType VT> { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - 0))); - Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods); + Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods, + !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods, + !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods, + Int32SDWAInputMods))); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. @@ -1733,6 +1830,9 @@ def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>; def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>; def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>; +def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>; +def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>; + class Commutable_REV <string revOp, bit isOrig> { string RevOp = revOp; bit IsOrig = isOrig; @@ -1747,6 +1847,8 @@ class AtomicNoRet <string noRetOp, bit isRet> { // Interpolation opcodes //===----------------------------------------------------------------------===// +class VINTRPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVINTRPDst">; + class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : VINTRPCommon <outs, ins, "", pattern>, SIMCInstr<opName, SIEncodingFamily.NONE> { @@ -1823,38 +1925,6 @@ def getBasicFromSDWAOp : InstrMapping { let ValueCols = [["Default"]]; } -def getMaskedMIMGOp1 : InstrMapping { - let FilterClass = "MIMG_Mask"; - let RowFields = ["Op"]; - let ColFields = ["Channels"]; - let KeyCol = ["1"]; - let ValueCols = [["2"], ["3"], ["4"] ]; -} - -def getMaskedMIMGOp2 : InstrMapping { - let FilterClass = "MIMG_Mask"; - let RowFields = ["Op"]; - let ColFields = ["Channels"]; - let KeyCol = ["2"]; - let ValueCols = [["1"], ["3"], ["4"] ]; -} - -def getMaskedMIMGOp3 : InstrMapping { - let FilterClass = "MIMG_Mask"; - let RowFields = ["Op"]; - let ColFields = ["Channels"]; - let KeyCol = ["3"]; - let ValueCols = [["1"], ["2"], ["4"] ]; -} - -def getMaskedMIMGOp4 : InstrMapping { - let FilterClass = "MIMG_Mask"; - let RowFields = ["Op"]; - let ColFields = ["Channels"]; - let KeyCol = ["4"]; - let ValueCols = [["1"], ["2"], ["3"] ]; -} - // Maps an commuted opcode to its original version def getCommuteOrig : InstrMapping { let FilterClass = "Commutable_REV"; @@ -1882,6 +1952,11 @@ def getMCOpcodeGen : InstrMapping { [!cast<string>(SIEncodingFamily.VI)], [!cast<string>(SIEncodingFamily.SDWA)], [!cast<string>(SIEncodingFamily.SDWA9)], + // GFX80 encoding is added to work around a multiple matching + // issue for buffer instructions with unpacked d16 data. This + // does not actually change the encoding, and thus may be + // removed later. 
+ [!cast<string>(SIEncodingFamily.GFX80)], [!cast<string>(SIEncodingFamily.GFX9)]]; } @@ -1902,6 +1977,14 @@ def getAddr64Inst : InstrMapping { let ValueCols = [["1"]]; } +def getMUBUFNoLdsInst : InstrMapping { + let FilterClass = "MUBUFLdsTable"; + let RowFields = ["OpName"]; + let ColFields = ["IsLds"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + // Maps an atomic opcode to its version with a return value. def getAtomicRetOp : InstrMapping { let FilterClass = "AtomicNoRet"; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 9740a18b7248..c3f8bfb53ef4 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -11,18 +11,10 @@ // that are not yet supported remain commented out. //===----------------------------------------------------------------------===// -def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; -def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; -def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, - AssemblerPredicate<"FeatureVGPRIndexMode">; -def HasMovrel : Predicate<"Subtarget->hasMovrel()">, - AssemblerPredicate<"FeatureMovrel">; - -class GCNPat<dag pattern, dag result> : AMDGPUPat<pattern, result> { +class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl { let SubtargetPredicate = isGCN; } - include "VOPInstructions.td" include "SOPInstructions.td" include "SMInstructions.td" @@ -40,15 +32,18 @@ defm EXP_DONE : EXP_m<1, AMDGPUexport_done>; // VINTRP Instructions //===----------------------------------------------------------------------===// +// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI) +def VINTRPDst : VINTRPDstOperand <VGPR_32>; + let Uses = [M0, EXEC] in { // FIXME: Specify SchedRW for VINTRP insturctions. 
multiclass V_INTERP_P1_F32_m : VINTRP_m < 0x00000000, - (outs VGPR_32:$vdst), + (outs VINTRPDst:$vdst), (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), - "v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan", + "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan", [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan), (i32 imm:$attr)))] >; @@ -69,9 +64,9 @@ let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { defm V_INTERP_P2_F32 : VINTRP_m < 0x00000001, - (outs VGPR_32:$vdst), + (outs VINTRPDst:$vdst), (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), - "v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan", + "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan", [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan), (i32 imm:$attr)))]>; @@ -79,9 +74,9 @@ defm V_INTERP_P2_F32 : VINTRP_m < defm V_INTERP_MOV_F32 : VINTRP_m < 0x00000002, - (outs VGPR_32:$vdst), + (outs VINTRPDst:$vdst), (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), - "v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan", + "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan", [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan), (i32 imm:$attr)))]>; @@ -186,6 +181,7 @@ def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst), let SALU = 1; let isAsCheapAsAMove = 1; let isTerminator = 1; + let Defs = [SCC]; } def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst), @@ -246,7 +242,6 @@ def SI_IF: CFPseudoInstSI < def SI_ELSE : CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { - let Constraints = "$src = $dst"; let Size = 12; let hasSideEffects = 1; } @@ -296,14 +291,21 @@ def SI_ELSE_BREAK : CFPseudoInstSI < let isReMaterializable = 1; } -let Uses = [EXEC], Defs = [EXEC,VCC] in { +let Uses = [EXEC] in { multiclass PseudoInstKill <dag ins> { + // Even though this pseudo can usually be expanded without an SCC def, we + // conservatively assume that it has an SCC def, both because it is sometimes + // required in degenerate cases (when V_CMPX cannot be used due to constant + // bus limitations) and because it allows us to avoid having to track SCC + // liveness across basic blocks. + let Defs = [EXEC,VCC,SCC] in def _PSEUDO : PseudoInstSI <(outs), ins> { let isConvergent = 1; let usesCustomInserter = 1; } + let Defs = [EXEC,VCC,SCC] in def _TERMINATOR : SPseudoInstSI <(outs), ins> { let isTerminator = 1; } @@ -312,6 +314,7 @@ multiclass PseudoInstKill <dag ins> { defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>; defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>; +let Defs = [EXEC,VCC] in def SI_ILLEGAL_COPY : SPseudoInstSI < (outs unknown:$dst), (ins unknown:$src), [], " ; illegal copy $src to $dst">; @@ -371,6 +374,7 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI < let isReturn = 1; let hasNoSchedulingInfo = 1; let DisableWQM = 1; + let FixedSize = 1; } // Return for returning function calls. 
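The VINTRPDst operand introduced above exists purely to route assembly printing through a custom method so that VI output carries an explicit "_e32" suffix (the asm strings drop the space before $vdst and let the printer supply it). The snippet below is a simplified sketch of what such a PrintMethod hook can look like; it assumes the surrounding AMDGPUInstPrinter class with its usual isSI/isCI/printOperand helpers and is not quoted from this commit:

    // Sketch (assumed, simplified): the PrintMethod named by the RegisterOperand
    // ("printVINTRPDst") emits the encoding suffix before the destination
    // register, so SI/CI keep the plain mnemonic while VI gets "_e32".
    void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
                                           const MCSubtargetInfo &STI,
                                           raw_ostream &O) {
      if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI))
        O << " ";      // e.g. "v_interp_p1_f32 v0, v1, attr0.x"
      else
        O << "_e32 ";  // VI: distinguish from the new _e64 variants
      printOperand(MI, OpNo, STI, O);
    }
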
@@ -449,7 +453,7 @@ def ADJCALLSTACKDOWN : SPseudoInstSI< let usesCustomInserter = 1; } -let Defs = [M0, EXEC], +let Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1 in { class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI < @@ -569,11 +573,6 @@ def : GCNPat< (SI_ELSE $src, $target, 0) >; -def : GCNPat < - (int_AMDGPU_kilp), - (SI_KILL_I1_PSEUDO (i1 0), 0) ->; - def : Pat < // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0) (AMDGPUkill (i32 -1082130432)), @@ -643,6 +642,11 @@ def : GCNPat < >; def : GCNPat < + (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), + (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : GCNPat < (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))), (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; @@ -700,15 +704,19 @@ multiclass FMADPat <ValueType vt, Instruction inst> { defm : FMADPat <f16, V_MAC_F16_e64>; defm : FMADPat <f32, V_MAC_F32_e64>; -class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : GCNPat< - (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod), - (VOP3Mods f32:$src1, i32:$src1_mod), - (VOP3Mods f32:$src2, i32:$src2_mod))), +class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty> + : GCNPat< + (Ty (mad_opr (VOP3Mods Ty:$src0, i32:$src0_mod), + (VOP3Mods Ty:$src1, i32:$src1_mod), + (VOP3Mods Ty:$src2, i32:$src2_mod))), (inst $src0_mod, $src0, $src1_mod, $src1, $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) >; -def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>; +def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>; +def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> { + let SubtargetPredicate = Has16BitInsts; +} multiclass SelectPat <ValueType vt, Instruction inst> { def : GCNPat < @@ -726,6 +734,10 @@ def : GCNPat < (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), (V_BCNT_U32_B32_e64 $popcnt, $val) >; +def : GCNPat < + (i16 (add (i16 (trunc (ctpop i32:$popcnt))), i16:$val)), + (V_BCNT_U32_B32_e64 $popcnt, $val) +>; /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ @@ -795,6 +807,27 @@ foreach Index = 0-15 in { >; } + +def : Pat < + (extract_subvector v4i16:$vec, (i32 0)), + (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0)) +>; + +def : Pat < + (extract_subvector v4i16:$vec, (i32 2)), + (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1)) +>; + +def : Pat < + (extract_subvector v4f16:$vec, (i32 0)), + (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0)) +>; + +def : Pat < + (extract_subvector v4f16:$vec, (i32 2)), + (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) +>; + let SubtargetPredicate = isGCN in { // FIXME: Why do only some of these type combinations for SReg and @@ -834,6 +867,26 @@ def : BitConvert <f64, v2f32, VReg_64>; def : BitConvert <v2f32, f64, VReg_64>; def : BitConvert <f64, v2i32, VReg_64>; def : BitConvert <v2i32, f64, VReg_64>; + +// FIXME: Make SGPR +def : BitConvert <v2i32, v4f16, VReg_64>; +def : BitConvert <v4f16, v2i32, VReg_64>; +def : BitConvert <v2i32, v4f16, VReg_64>; +def : BitConvert <v2i32, v4i16, VReg_64>; +def : BitConvert <v4i16, v2i32, VReg_64>; +def : BitConvert <v2f32, v4f16, VReg_64>; +def : BitConvert <v4f16, v2f32, VReg_64>; +def : BitConvert <v2f32, v4i16, VReg_64>; +def : BitConvert <v4i16, v2f32, VReg_64>; +def : BitConvert <v4i16, f64, VReg_64>; +def : BitConvert <v4f16, f64, VReg_64>; +def : BitConvert <f64, v4i16, VReg_64>; +def : BitConvert <f64, v4f16, VReg_64>; +def : BitConvert 
<v4i16, i64, VReg_64>; +def : BitConvert <v4f16, i64, VReg_64>; +def : BitConvert <i64, v4i16, VReg_64>; +def : BitConvert <i64, v4f16, VReg_64>; + def : BitConvert <v4i32, v4f32, VReg_128>; def : BitConvert <v4f32, v4i32, VReg_128>; @@ -876,11 +929,13 @@ def : ClampPat<V_MAX_F32_e64, f32>; def : ClampPat<V_MAX_F64, f64>; def : ClampPat<V_MAX_F16_e64, f16>; +let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))), (V_PK_MAX_F16 $src0_modifiers, $src0, $src0_modifiers, $src0, DSTCLAMP.ENABLE) >; +} /********** ================================ **********/ /********** Floating point absolute/negative **********/ @@ -906,7 +961,7 @@ def : GCNPat < def : GCNPat < (fabs f32:$src), - (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff))) + (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff))) >; def : GCNPat < @@ -967,12 +1022,12 @@ def : GCNPat < def : GCNPat < (fneg f16:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000))) + (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000))) >; def : GCNPat < (fabs f16:$src), - (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff))) + (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff))) >; def : GCNPat < @@ -982,12 +1037,12 @@ def : GCNPat < def : GCNPat < (fneg v2f16:$src), - (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src) + (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000))) >; def : GCNPat < (fabs v2f16:$src), - (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src) + (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff))) >; // This is really (fneg (fabs v2f16:$src)) @@ -996,7 +1051,12 @@ def : GCNPat < // VOP3P instructions, so it is turned into the bit op. def : GCNPat < (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))), - (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit + (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit +>; + +def : GCNPat < + (fneg (v2f16 (fabs v2f16:$src))), + (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; /********** ================== **********/ @@ -1097,6 +1157,7 @@ let SubtargetPredicate = isGCN in { def : IMad24Pat<V_MAD_I32_I24, 1>; def : UMad24Pat<V_MAD_U32_U24, 1>; +// FIXME: This should only be done for VALU inputs defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; @@ -1337,11 +1398,13 @@ def : GCNPat< (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0) >; +let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE) >; } +} let OtherPredicates = [NoFP32Denormals] in { def : GCNPat< @@ -1371,6 +1434,16 @@ def : GCNPat< >; } +let OtherPredicates = [HasDLInsts] in { +def : GCNPat < + (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)), + (f32 (VOP3NoMods f32:$src2))), + (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + SRCMODS.NONE, $src2, $clamp, $omod) +>; +} // End OtherPredicates = [HasDLInsts] + // Allow integer inputs class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat< @@ -1381,11 +1454,6 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPa def : ExpPattern<AMDGPUexport, i32, EXP>; def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>; -def : GCNPat < - (v2i16 (build_vector i16:$src0, i16:$src1)), - (v2i16 (S_PACK_LL_B32_B16 $src0, $src1)) ->; - // COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs 
// from S_LSHL_B32's multiple outputs from implicit scc def. def : GCNPat < @@ -1393,6 +1461,13 @@ def : GCNPat < (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0)) >; + +let SubtargetPredicate = HasVOP3PInsts in { +def : GCNPat < + (v2i16 (build_vector i16:$src0, i16:$src1)), + (v2i16 (S_PACK_LL_B32_B16 $src0, $src1)) +>; + // With multiple uses of the shift, this will duplicate the shift and // increase register pressure. def : GCNPat < @@ -1400,6 +1475,7 @@ def : GCNPat < (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1)) >; + def : GCNPat < (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))), (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))), @@ -1412,6 +1488,9 @@ def : GCNPat < (v2f16 (S_PACK_LL_B32_B16 $src0, $src1)) >; +} // End SubtargetPredicate = HasVOP3PInsts + + // def : GCNPat < // (v2f16 (scalar_to_vector f16:$src0)), // (COPY $src0) @@ -1422,6 +1501,16 @@ def : GCNPat < // (COPY $src0) // >; +def : GCNPat < + (v4i16 (scalar_to_vector i16:$src0)), + (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) +>; + +def : GCNPat < + (v4f16 (scalar_to_vector f16:$src0)), + (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) +>; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// @@ -1486,7 +1575,7 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; // FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>; -def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; +defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>; def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>; def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>; diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 84cd47a101a8..4b537540046f 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -45,6 +45,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" @@ -102,7 +103,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass { }; private: - const SISubtarget *STM = nullptr; + const GCNSubtarget *STM = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; MachineRegisterInfo *MRI = nullptr; @@ -137,7 +138,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - StringRef getPassName() const override { return "SI Load / Store Optimizer"; } + StringRef getPassName() const override { return "SI Load Store Optimizer"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -150,10 +151,10 @@ public: } // end anonymous namespace. INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load / Store Optimizer", false, false) + "SI Load Store Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load / Store Optimizer", false, false) + "SI Load Store Optimizer", false, false) char SILoadStoreOptimizer::ID = 0; @@ -173,10 +174,18 @@ static void moveInstsAfter(MachineBasicBlock::iterator I, } } -static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) { - // XXX: Should this be looking for implicit defs? 
- for (const MachineOperand &Def : MI.defs()) - Defs.insert(Def.getReg()); +static void addDefsUsesToList(const MachineInstr &MI, + DenseSet<unsigned> &RegDefs, + DenseSet<unsigned> &PhysRegUses) { + for (const MachineOperand &Op : MI.operands()) { + if (Op.isReg()) { + if (Op.isDef()) + RegDefs.insert(Op.getReg()); + else if (Op.readsReg() && + TargetRegisterInfo::isPhysicalRegister(Op.getReg())) + PhysRegUses.insert(Op.getReg()); + } + } } static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, @@ -194,16 +203,24 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, // already in the list. Returns true in that case. static bool addToListsIfDependent(MachineInstr &MI, - DenseSet<unsigned> &Defs, + DenseSet<unsigned> &RegDefs, + DenseSet<unsigned> &PhysRegUses, SmallVectorImpl<MachineInstr*> &Insts) { for (MachineOperand &Use : MI.operands()) { // If one of the defs is read, then there is a use of Def between I and the // instruction that I will potentially be merged with. We will need to move // this instruction after the merged instructions. - - if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) { + // + // Similarly, if there is a def which is read by an instruction that is to + // be moved for merging, then we need to move the def-instruction as well. + // This can only happen for physical registers such as M0; virtual + // registers are in SSA form. + if (Use.isReg() && + ((Use.readsReg() && RegDefs.count(Use.getReg())) || + (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) && + PhysRegUses.count(Use.getReg())))) { Insts.push_back(&MI); - addDefsToList(MI, Defs); + addDefsUsesToList(MI, RegDefs, PhysRegUses); return true; } } @@ -332,8 +349,9 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { ++MBBI; - DenseSet<unsigned> DefsToMove; - addDefsToList(*CI.I, DefsToMove); + DenseSet<unsigned> RegDefsToMove; + DenseSet<unsigned> PhysRegUsesToMove; + addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); for ( ; MBBI != E; ++MBBI) { if (MBBI->getOpcode() != CI.I->getOpcode()) { @@ -356,14 +374,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. CI.InstsToMove.push_back(&*MBBI); - addDefsToList(*MBBI, DefsToMove); + addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); continue; } // When we match I with another DS instruction we will be moving I down // to the location of the matched instruction any uses of I will need to // be moved down as well. - addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove); + addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, + CI.InstsToMove); continue; } @@ -377,7 +396,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // DS_WRITE_B32 addr, f(w), idx1 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents // merging of the two writes. - if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove)) + if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, + CI.InstsToMove)) continue; bool Match = true; @@ -436,7 +456,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // down past this instruction. 
// check if we can move I across MBBI and if we can move all I's users if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) break; } return false; @@ -496,13 +516,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned BaseReg = AddrReg->getReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { + unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + .addImm(CI.BaseOff); + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - unsigned AddOpc = STM->hasAddNoCarry() ? - AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; - BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) - .addImm(CI.BaseOff) + TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) + .addReg(ImmReg) .addReg(AddrReg->getReg()); } @@ -532,7 +554,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); + LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); return Next; } @@ -556,7 +578,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); const MachineOperand *Data1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); @@ -579,17 +601,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = CI.I->getDebugLoc(); - unsigned BaseReg = Addr->getReg(); + unsigned BaseReg = AddrReg->getReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { + unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + .addImm(CI.BaseOff); + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - unsigned AddOpc = STM->hasAddNoCarry() ? - AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; - BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) - .addImm(CI.BaseOff) - .addReg(Addr->getReg()); + TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) + .addReg(ImmReg) + .addReg(AddrReg->getReg()); } MachineInstrBuilder Write2 = @@ -608,7 +632,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); + LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); return Next; } @@ -849,9 +873,8 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { continue; } - if (STM->hasSBufferLoadStoreAtomicDwordxN() && - (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || - Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) { + if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || + Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) { // EltSize is in units of the offset encoding. 
CI.InstClass = S_BUFFER_LOAD_IMM; CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); @@ -916,7 +939,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - STM = &MF.getSubtarget<SISubtarget>(); + STM = &MF.getSubtarget<GCNSubtarget>(); if (!STM->loadStoreOptEnabled()) return false; @@ -928,7 +951,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { assert(MRI->isSSA() && "Must be run on SSA"); - DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); + LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); bool Modified = false; diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index a9af83323976..ad30317c344c 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This pass lowers the pseudo control flow instructions to real +/// This pass lowers the pseudo control flow instructions to real /// machine instructions. /// /// All control flow is handled using predicated instructions and @@ -51,6 +51,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -343,11 +344,49 @@ void SILowerControlFlow::emitBreak(MachineInstr &MI) { } void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { - MI.setDesc(TII->get(AMDGPU::S_OR_B64)); + MachineBasicBlock &MBB = *MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + auto Dst = MI.getOperand(0).getReg(); + + // Skip ANDing with exec if the break condition is already masked by exec + // because it is a V_CMP in the same basic block. (We know the break + // condition operand was an i1 in IR, so if it is a VALU instruction it must + // be one with a carry-out.) + bool SkipAnding = false; + if (MI.getOperand(1).isReg()) { + if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) { + SkipAnding = Def->getParent() == MI.getParent() + && SIInstrInfo::isVALU(*Def); + } + } + + // AND the break condition operand with exec, then OR that into the "loop + // exit" mask. + MachineInstr *And = nullptr, *Or = nullptr; + if (!SkipAnding) { + And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(1)); + Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Dst) + .add(MI.getOperand(2)); + } else + Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)); + + if (LIS) { + if (And) + LIS->InsertMachineInstrInMaps(*And); + LIS->ReplaceMachineInstrInMaps(MI, *Or); + } + + MI.eraseFromParent(); } void SILowerControlFlow::emitElseBreak(MachineInstr &MI) { - MI.setDesc(TII->get(AMDGPU::S_OR_B64)); + // Lowered in the same way as emitIfBreak above. 
+ emitIfBreak(MI); } void SILowerControlFlow::emitLoop(MachineInstr &MI) { @@ -414,8 +453,8 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, return; for (const auto &SrcOp : Def->explicit_operands()) - if (SrcOp.isUse() && (!SrcOp.isReg() || - TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) || + if (SrcOp.isReg() && SrcOp.isUse() && + (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) || SrcOp.getReg() == AMDGPU::EXEC)) Src.push_back(SrcOp); } @@ -447,7 +486,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { } bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index da57b90dd8c4..ecc6cff407e1 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -17,6 +17,8 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPULaneDominator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -64,7 +66,7 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); @@ -141,7 +143,8 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { DefInst->getOperand(3).getReg()) && TRI->getCommonSubClass( MRI.getRegClass(DefInst->getOperand(3).getReg()), - &AMDGPU::SGPR_64RegClass)) { + &AMDGPU::SGPR_64RegClass) && + AMDGPU::laneDominates(DefInst->getParent(), &MBB)) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64)) .add(Dst) .addReg(AMDGPU::EXEC) diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 6013ebc81d9f..0d5ff75e37ed 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -11,6 +11,7 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUSubtarget.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -28,17 +29,12 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), - BufferPSV(*(MF.getSubtarget().getInstrInfo())), - ImagePSV(*(MF.getSubtarget().getInstrInfo())), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), KernargSegmentPtr(false), DispatchID(false), FlatScratchInit(false), - GridWorkgroupCountX(false), - GridWorkgroupCountY(false), - GridWorkgroupCountZ(false), WorkGroupIDX(false), WorkGroupIDY(false), WorkGroupIDZ(false), @@ -49,12 +45,26 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDZ(false), ImplicitBufferPtr(false), ImplicitArgPtr(false), - GITPtrHigh(0xffffffff) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + GITPtrHigh(0xffffffff), + HighBitsOf32BitAddress(0) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const Function &F = 
MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); + Occupancy = getMaxWavesPerEU(); + limitOccupancy(MF); + CallingConv::ID CC = F.getCallingConv(); + + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { + if (!F.arg_empty()) + KernargSegmentPtr = true; + WorkGroupIDX = true; + WorkItemIDX = true; + } else if (CC == CallingConv::AMDGPU_PS) { + PSInputAddr = AMDGPU::getInitialPSInputAddr(F); + } + if (!isEntryFunction()) { // Non-entry functions have no special inputs for now, other registers // required for scratch access. @@ -71,18 +81,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) ImplicitArgPtr = true; } else { - if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) - KernargSegmentPtr = true; - } - - CallingConv::ID CC = F.getCallingConv(); - if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { - if (!F.arg_empty()) + if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) { KernargSegmentPtr = true; - WorkGroupIDX = true; - WorkItemIDX = true; - } else if (CC == CallingConv::AMDGPU_PS) { - PSInputAddr = AMDGPU::getInitialPSInputAddr(F); + MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(), + MaxKernArgAlign); + } } if (ST.debuggerEmitPrologue()) { @@ -134,7 +137,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } } - bool IsCOV2 = ST.isAmdCodeObjectV2(MF); + bool IsCOV2 = ST.isAmdCodeObjectV2(F); if (IsCOV2) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; @@ -147,7 +150,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F.hasFnAttribute("amdgpu-dispatch-id")) DispatchID = true; - } else if (ST.isMesaGfxShader(MF)) { + } else if (ST.isMesaGfxShader(F)) { if (HasStackObjects || MaySpill) ImplicitBufferPtr = true; } @@ -166,6 +169,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) StringRef S = A.getValueAsString(); if (!S.empty()) S.consumeInteger(0, GITPtrHigh); + + A = F.getFnAttribute("amdgpu-32bit-address-high-bits"); + S = A.getValueAsString(); + if (!S.empty()) + S.consumeInteger(0, HighBitsOf32BitAddress); +} + +void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { + limitOccupancy(getMaxWavesPerEU()); + const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>(); + limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(), + MF.getFunction())); } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( @@ -238,7 +253,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, if (!SpillLanes.empty()) return true; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -269,10 +284,9 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, } Optional<int> CSRSpillFI; - if (FrameInfo.hasCalls() && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) { - // TODO: Should this be a CreateSpillStackObject? This is technically a - // weird CSR spill. 
- CSRSpillFI = FrameInfo.CreateStackObject(4, 4, false); + if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs && + isCalleeSavedReg(CSRegs, LaneVGPR)) { + CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4); } SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI)); @@ -295,3 +309,29 @@ void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) for (auto &R : SGPRToVGPRSpills) MFI.RemoveStackObject(R.first); } + + +/// \returns VGPR used for \p Dim' work item ID. +unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const { + switch (Dim) { + case 0: + assert(hasWorkItemIDX()); + return AMDGPU::VGPR0; + case 1: + assert(hasWorkItemIDY()); + return AMDGPU::VGPR1; + case 2: + assert(hasWorkItemIDZ()); + return AMDGPU::VGPR2; + } + llvm_unreachable("unexpected dimension"); +} + +MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; +} + +MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 5dde72910ee3..ef91d1e43075 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -16,7 +16,9 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUMachineFunction.h" +#include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" @@ -38,8 +40,9 @@ class TargetRegisterClass; class AMDGPUImagePseudoSourceValue : public PseudoSourceValue { public: + // TODO: Is the img rsrc useful? explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) : - PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { } + PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) {} bool isConstant(const MachineFrameInfo *) const override { // This should probably be true for most images, but we will start by being @@ -48,15 +51,11 @@ public: } bool isAliased(const MachineFrameInfo *) const override { - // FIXME: If we ever change image intrinsics to accept fat pointers, then - // this could be true for some cases. - return false; + return true; } bool mayAlias(const MachineFrameInfo *) const override { - // FIXME: If we ever change image intrinsics to accept fat pointers, then - // this could be true for some cases. - return false; + return true; } }; @@ -72,15 +71,11 @@ public: } bool isAliased(const MachineFrameInfo *) const override { - // FIXME: If we ever change image intrinsics to accept fat pointers, then - // this could be true for some cases. - return false; + return true; } bool mayAlias(const MachineFrameInfo *) const override { - // FIXME: If we ever change image intrinsics to accept fat pointers, then - // this could be true for some cases. - return false; + return true; } }; @@ -135,8 +130,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // Stack object indices for work item IDs. 
std::array<int, 3> DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}}; - AMDGPUBufferPseudoSourceValue BufferPSV; - AMDGPUImagePseudoSourceValue ImagePSV; + DenseMap<const Value *, + std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs; + DenseMap<const Value *, + std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs; private: unsigned LDSWaveSpillSize = 0; @@ -146,6 +143,7 @@ private: bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; bool HasNonSpillStackObjects = false; + bool IsStackRealigned = false; unsigned NumSpilledSGPRs = 0; unsigned NumSpilledVGPRs = 0; @@ -157,9 +155,6 @@ private: bool KernargSegmentPtr : 1; bool DispatchID : 1; bool FlatScratchInit : 1; - bool GridWorkgroupCountX : 1; - bool GridWorkgroupCountY : 1; - bool GridWorkgroupCountZ : 1; // Feature bits required for inputs passed in system SGPRs. bool WorkGroupIDX : 1; // Always initialized. @@ -186,25 +181,25 @@ private: // current hardware only allows a 16 bit value. unsigned GITPtrHigh; - MCPhysReg getNextUserSGPR() const { - assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); - return AMDGPU::SGPR0 + NumUserSGPRs; - } + unsigned HighBitsOf32BitAddress; - MCPhysReg getNextSystemSGPR() const { - return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; - } + // Current recorded maximum possible occupancy. + unsigned Occupancy; + + MCPhysReg getNextUserSGPR() const; + + MCPhysReg getNextSystemSGPR() const; public: struct SpilledReg { - unsigned VGPR = AMDGPU::NoRegister; + unsigned VGPR = 0; int Lane = -1; SpilledReg() = default; SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {} bool hasLane() { return Lane != -1;} - bool hasReg() { return VGPR != AMDGPU::NoRegister;} + bool hasReg() { return VGPR != 0;} }; struct SGPRSpillVGPRCSR { @@ -244,8 +239,8 @@ public: bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); - bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; } - unsigned getTIDReg() const { return TIDReg; } + bool hasCalculatedTID() const { return TIDReg != 0; }; + unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } unsigned getBytesInStackArgArea() const { @@ -338,18 +333,6 @@ public: return FlatScratchInit; } - bool hasGridWorkgroupCountX() const { - return GridWorkgroupCountX; - } - - bool hasGridWorkgroupCountY() const { - return GridWorkgroupCountY; - } - - bool hasGridWorkgroupCountZ() const { - return GridWorkgroupCountZ; - } - bool hasWorkGroupIDX() const { return WorkGroupIDX; } @@ -411,6 +394,10 @@ public: return GITPtrHigh; } + unsigned get32BitAddressHighBits() const { + return HighBitsOf32BitAddress; + } + unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -423,14 +410,14 @@ public: return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } - /// \brief Returns the physical register reserved for use as the resource + /// Returns the physical register reserved for use as the resource /// descriptor for scratch accesses. 
unsigned getScratchRSrcReg() const { return ScratchRSrcReg; } void setScratchRSrcReg(unsigned Reg) { - assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + assert(Reg != 0 && "Should never be unset"); ScratchRSrcReg = Reg; } @@ -443,6 +430,7 @@ public: } void setStackPtrOffsetReg(unsigned Reg) { + assert(Reg != 0 && "Should never be unset"); StackPtrOffsetReg = Reg; } @@ -455,7 +443,7 @@ public: } void setScratchWaveOffsetReg(unsigned Reg) { - assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + assert(Reg != 0 && "Should never be unset"); ScratchWaveOffsetReg = Reg; if (isEntryFunction()) FrameOffsetReg = ScratchWaveOffsetReg; @@ -493,6 +481,14 @@ public: HasNonSpillStackObjects = StackObject; } + bool isStackRealigned() const { + return IsStackRealigned; + } + + void setIsStackRealigned(bool Realigned = true) { + IsStackRealigned = Realigned; + } + unsigned getNumSpilledSGPRs() const { return NumSpilledSGPRs; } @@ -575,7 +571,7 @@ public: return DebuggerWorkGroupIDStackObjectIndices[Dim]; } - /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx. + /// Sets stack object index for \p Dim's work group ID to \p ObjectIdx. void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) { assert(Dim < 3); DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx; @@ -587,7 +583,7 @@ public: return DebuggerWorkItemIDStackObjectIndices[Dim]; } - /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx. + /// Sets stack object index for \p Dim's work item ID to \p ObjectIdx. void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) { assert(Dim < 3); DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx; @@ -610,31 +606,51 @@ public: } /// \returns VGPR used for \p Dim' work item ID. - unsigned getWorkItemIDVGPR(unsigned Dim) const { - switch (Dim) { - case 0: - assert(hasWorkItemIDX()); - return AMDGPU::VGPR0; - case 1: - assert(hasWorkItemIDY()); - return AMDGPU::VGPR1; - case 2: - assert(hasWorkItemIDZ()); - return AMDGPU::VGPR2; - } - llvm_unreachable("unexpected dimension"); - } + unsigned getWorkItemIDVGPR(unsigned Dim) const; unsigned getLDSWaveSpillSize() const { return LDSWaveSpillSize; } - const AMDGPUBufferPseudoSourceValue *getBufferPSV() const { - return &BufferPSV; + const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII, + const Value *BufferRsrc) { + assert(BufferRsrc); + auto PSV = BufferPSVs.try_emplace( + BufferRsrc, + llvm::make_unique<AMDGPUBufferPseudoSourceValue>(TII)); + return PSV.first->second.get(); + } + + const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII, + const Value *ImgRsrc) { + assert(ImgRsrc); + auto PSV = ImagePSVs.try_emplace( + ImgRsrc, + llvm::make_unique<AMDGPUImagePseudoSourceValue>(TII)); + return PSV.first->second.get(); + } + + unsigned getOccupancy() const { + return Occupancy; + } + + unsigned getMinAllowedOccupancy() const { + if (!isMemoryBound() && !needsWaveLimiter()) + return Occupancy; + return (Occupancy < 4) ? 
Occupancy : 4; + } + + void limitOccupancy(const MachineFunction &MF); + + void limitOccupancy(unsigned Limit) { + if (Occupancy > Limit) + Occupancy = Limit; } - const AMDGPUImagePseudoSourceValue *getImagePSV() const { - return &ImagePSV; + void increaseOccupancy(const MachineFunction &MF, unsigned Limit) { + if (Occupancy < Limit) + Occupancy = Limit; + limitOccupancy(MF); } }; diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index 6b67b76652ed..18754442898f 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief SI Machine Scheduler interface +/// SI Machine Scheduler interface // //===----------------------------------------------------------------------===// @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LiveInterval.h" @@ -154,6 +155,8 @@ static const char *getReasonStr(SIScheduleCandReason Reason) { #endif +namespace llvm { +namespace SISched { static bool tryLess(int TryVal, int CandVal, SISchedulerCandidate &TryCand, SISchedulerCandidate &Cand, @@ -187,6 +190,8 @@ static bool tryGreater(int TryVal, int CandVal, Cand.setRepeat(Reason); return false; } +} // end namespace SISched +} // end namespace llvm // SIScheduleBlock // @@ -212,7 +217,8 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, } if (Cand.SGPRUsage > 60 && - tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage)) + SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, + TryCand, Cand, RegUsage)) return; // Schedule low latency instructions as top as possible. @@ -230,21 +236,22 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, // could go quite high, thus above the arbitrary limit of 60 will encourage // use the already loaded constants (in order to release some SGPRs) before // loading more. - if (tryLess(TryCand.HasLowLatencyNonWaitedParent, - Cand.HasLowLatencyNonWaitedParent, - TryCand, Cand, SIScheduleCandReason::Depth)) + if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent, + Cand.HasLowLatencyNonWaitedParent, + TryCand, Cand, SIScheduleCandReason::Depth)) return; - if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, - TryCand, Cand, SIScheduleCandReason::Depth)) + if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, + TryCand, Cand, SIScheduleCandReason::Depth)) return; if (TryCand.IsLowLatency && - tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, - TryCand, Cand, SIScheduleCandReason::Depth)) + SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, + TryCand, Cand, SIScheduleCandReason::Depth)) return; - if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage)) + if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, + TryCand, Cand, RegUsage)) return; // Fall through to original instruction order. 
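The tryLess/tryGreater calls now qualified with SISched:: are the scheduler's cascading tie-break helpers: each comparison either picks a winner (recording the deciding reason) or returns false so the caller falls through to the next, weaker criterion, which is why tryCandidateTopDown and the block-scheduling heuristics are written as chains of early returns. Below is a condensed, self-contained sketch of that pattern; the SISchedulerCandidate bookkeeping (setRepeat and friends) is deliberately elided:

    // Minimal sketch of the cascading tie-break idiom used by the SI machine
    // scheduler. Callers chain these helpers from the strongest to the weakest
    // criterion and stop at the first call that returns true.
    enum Reason { NoCand, RegUsage, Latency, Successor, Depth };

    struct Candidate {
      Reason Why = NoCand;   // which criterion decided this candidate
    };

    // Prefer the candidate with the smaller value; a tie falls through.
    static bool tryLess(int TryVal, int CandVal, Candidate &TryCand,
                        Candidate &Cand, Reason R) {
      if (TryVal < CandVal) {
        TryCand.Why = R;     // TryCand wins on this criterion
        return true;
      }
      if (TryVal > CandVal) {
        if (Cand.Why > R)    // record the strongest (lowest-numbered) reason
          Cand.Why = R;
        return true;         // Cand wins; stop comparing
      }
      return false;          // tie: let the caller try the next criterion
    }

    // Prefer the candidate with the larger value; same contract as tryLess.
    static bool tryGreater(int TryVal, int CandVal, Candidate &TryCand,
                           Candidate &Cand, Reason R) {
      return tryLess(CandVal, TryVal, TryCand, Cand, R);
    }
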
@@ -1201,7 +1208,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria NextReservedID = 1; NextNonReservedID = DAGSize + 1; - DEBUG(dbgs() << "Coloring the graph\n"); + LLVM_DEBUG(dbgs() << "Coloring the graph\n"); if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped) colorHighLatenciesGroups(); @@ -1258,13 +1265,11 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria SIScheduleBlock *Block = CurrentBlocks[i]; Block->finalizeUnits(); } - DEBUG( - dbgs() << "Blocks created:\n\n"; - for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); - } - ); + LLVM_DEBUG(dbgs() << "Blocks created:\n\n"; + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + }); } // Two functions taken from Codegen/MachineScheduler.cpp @@ -1274,7 +1279,7 @@ static MachineBasicBlock::iterator nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::const_iterator End) { for (; I != End; ++I) { - if (!I->isDebugValue()) + if (!I->isDebugInstr()) break; } return I; @@ -1284,7 +1289,7 @@ void SIScheduleBlockCreator::topologicalSort() { unsigned DAGSize = CurrentBlocks.size(); std::vector<int> WorkList; - DEBUG(dbgs() << "Topological Sort\n"); + LLVM_DEBUG(dbgs() << "Topological Sort\n"); WorkList.reserve(DAGSize); TopDownIndex2Block.resize(DAGSize); @@ -1331,11 +1336,11 @@ void SIScheduleBlockCreator::topologicalSort() { void SIScheduleBlockCreator::scheduleInsideBlocks() { unsigned DAGSize = CurrentBlocks.size(); - DEBUG(dbgs() << "\nScheduling Blocks\n\n"); + LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n"); // We do schedule a valid scheduling such that a Block corresponds // to a range of instructions. - DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n"); + LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n"); for (unsigned i = 0, e = DAGSize; i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->fastSchedule(); @@ -1389,7 +1394,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr()); } - DEBUG(dbgs() << "Restoring MI Pos\n"); + LLVM_DEBUG(dbgs() << "Restoring MI Pos\n"); // Restore old ordering (which prevents a LIS->handleMove bug). 
for (unsigned i = PosOld.size(), e = 0; i != e; --i) { MachineBasicBlock::iterator POld = PosOld[i-1]; @@ -1403,12 +1408,10 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { } } - DEBUG( - for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); - } - ); + LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + }); } void SIScheduleBlockCreator::fillStats() { @@ -1559,13 +1562,10 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, blockScheduled(Block); } - DEBUG( - dbgs() << "Block Order:"; - for (SIScheduleBlock* Block : BlocksScheduled) { - dbgs() << ' ' << Block->getID(); - } - dbgs() << '\n'; - ); + LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block + : BlocksScheduled) { + dbgs() << ' ' << Block->getID(); + } dbgs() << '\n';); } bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, @@ -1576,19 +1576,19 @@ bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, } // Try to hide high latencies. - if (tryLess(TryCand.LastPosHighLatParentScheduled, - Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) + if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled, + Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) return true; // Schedule high latencies early so you can hide them better. - if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, - TryCand, Cand, Latency)) + if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, + TryCand, Cand, Latency)) return true; - if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height, - TryCand, Cand, Depth)) + if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height, + TryCand, Cand, Depth)) return true; - if (tryGreater(TryCand.NumHighLatencySuccessors, - Cand.NumHighLatencySuccessors, - TryCand, Cand, Successor)) + if (SISched::tryGreater(TryCand.NumHighLatencySuccessors, + Cand.NumHighLatencySuccessors, + TryCand, Cand, Successor)) return true; return false; } @@ -1600,17 +1600,17 @@ bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand, return true; } - if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, - TryCand, Cand, RegUsage)) + if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, + TryCand, Cand, RegUsage)) return true; - if (tryGreater(TryCand.NumSuccessors > 0, - Cand.NumSuccessors > 0, - TryCand, Cand, Successor)) + if (SISched::tryGreater(TryCand.NumSuccessors > 0, + Cand.NumSuccessors > 0, + TryCand, Cand, Successor)) return true; - if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) + if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) return true; - if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, - TryCand, Cand, RegUsage)) + if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, + TryCand, Cand, RegUsage)) return true; return false; } @@ -1628,18 +1628,17 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { maxVregUsage = VregCurrentUsage; if (SregCurrentUsage > maxSregUsage) maxSregUsage = SregCurrentUsage; - DEBUG( - dbgs() << "Picking New Blocks\n"; - dbgs() << "Available: "; - for (SIScheduleBlock* Block : ReadyBlocks) - dbgs() << Block->getID() << ' '; - dbgs() << "\nCurrent Live:\n"; - for (unsigned Reg : LiveRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; - dbgs() << 
'\n'; - dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; - dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n'; - ); + LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; + for (SIScheduleBlock *Block + : ReadyBlocks) dbgs() + << Block->getID() << ' '; + dbgs() << "\nCurrent Live:\n"; + for (unsigned Reg + : LiveRegs) dbgs() + << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << '\n'; + dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; + dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';); Cand.Block = nullptr; for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), @@ -1671,20 +1670,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { if (TryCand.Reason != NoCand) { Cand.setBest(TryCand); Best = I; - DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' - << getReasonStr(Cand.Reason) << '\n'); + LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' + << getReasonStr(Cand.Reason) << '\n'); } } - DEBUG( - dbgs() << "Picking: " << Cand.Block->getID() << '\n'; - dbgs() << "Is a block with high latency instruction: " - << (Cand.IsHighLatency ? "yes\n" : "no\n"); - dbgs() << "Position of last high latency dependency: " - << Cand.LastPosHighLatParentScheduled << '\n'; - dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; - dbgs() << '\n'; - ); + LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n'; + dbgs() << "Is a block with high latency instruction: " + << (Cand.IsHighLatency ? "yes\n" : "no\n"); + dbgs() << "Position of last high latency dependency: " + << Cand.LastPosHighLatParentScheduled << '\n'; + dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; + dbgs() << '\n';); Block = Cand.Block; ReadyBlocks.erase(Best); @@ -1933,13 +1930,10 @@ void SIScheduleDAGMI::schedule() { SmallVector<SUnit*, 8> TopRoots, BotRoots; SIScheduleBlockResult Best, Temp; - DEBUG(dbgs() << "Preparing Scheduling\n"); + LLVM_DEBUG(dbgs() << "Preparing Scheduling\n"); buildDAGWithRegPressure(); - DEBUG( - for(SUnit& SU : SUnits) - SU.dumpAll(this) - ); + LLVM_DEBUG(for (SUnit &SU : SUnits) SU.dumpAll(this)); topologicalSort(); findRootsAndBiasEdges(TopRoots, BotRoots); @@ -2041,15 +2035,15 @@ void SIScheduleDAGMI::schedule() scheduleMI(SU, true); - DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " - << *SU->getInstr()); + LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " + << *SU->getInstr()); } assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); placeDebugValues(); - DEBUG({ + LLVM_DEBUG({ dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h index d824e38504e6..0ce68ac6a897 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/lib/Target/AMDGPU/SIMachineScheduler.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief SI Machine Scheduler interface +/// SI Machine Scheduler interface // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index c73fb10b7ea0..938cdaf1ef8f 100644 --- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Memory legalizer - 
implements memory model. More information can be +/// Memory legalizer - implements memory model. More information can be /// found here: /// http://llvm.org/docs/AMDGPUUsage.html#memory-model // @@ -19,7 +19,9 @@ #include "AMDGPUSubtarget.h" #include "SIDefines.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -36,6 +38,7 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/MathExtras.h" #include <cassert> #include <list> @@ -47,42 +50,142 @@ using namespace llvm::AMDGPU; namespace { +LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); + +/// Memory operation flags. Can be ORed together. +enum class SIMemOp { + NONE = 0u, + LOAD = 1u << 0, + STORE = 1u << 1, + LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) +}; + +/// Position to insert a new instruction relative to an existing +/// instruction. +enum class Position { + BEFORE, + AFTER +}; + +/// The atomic synchronization scopes supported by the AMDGPU target. +enum class SIAtomicScope { + NONE, + SINGLETHREAD, + WAVEFRONT, + WORKGROUP, + AGENT, + SYSTEM +}; + +/// The distinct address spaces supported by the AMDGPU target for +/// atomic memory operation. Can be ORed toether. +enum class SIAtomicAddrSpace { + NONE = 0u, + GLOBAL = 1u << 0, + LDS = 1u << 1, + SCRATCH = 1u << 2, + GDS = 1u << 3, + OTHER = 1u << 4, + + /// The address spaces that can be accessed by a FLAT instruction. + FLAT = GLOBAL | LDS | SCRATCH, + + /// The address spaces that support atomic instructions. + ATOMIC = GLOBAL | LDS | SCRATCH | GDS, + + /// All address spaces. + ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, + + LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) +}; + +/// Sets named bit \p BitName to "true" if present in instruction \p MI. +/// \returns Returns true if \p MI is modified, false otherwise. +template <uint16_t BitName> +bool enableNamedBit(const MachineBasicBlock::iterator &MI) { + int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName); + if (BitIdx == -1) + return false; + + MachineOperand &Bit = MI->getOperand(BitIdx); + if (Bit.getImm() != 0) + return false; + + Bit.setImm(1); + return true; +} + class SIMemOpInfo final { private: - SyncScope::ID SSID = SyncScope::System; + + friend class SIMemOpAccess; + AtomicOrdering Ordering = AtomicOrdering::NotAtomic; AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; + SIAtomicScope Scope = SIAtomicScope::SYSTEM; + SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; + SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; + bool IsCrossAddressSpaceOrdering = false; bool IsNonTemporal = false; - SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering) - : SSID(SSID), Ordering(Ordering) {} - - SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering, - AtomicOrdering FailureOrdering, bool IsNonTemporal = false) - : SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering), - IsNonTemporal(IsNonTemporal) {} - - /// \returns Info constructed from \p MI, which has at least machine memory - /// operand. 
- static Optional<SIMemOpInfo> constructFromMIWithMMO( - const MachineBasicBlock::iterator &MI); + SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, + SIAtomicScope Scope = SIAtomicScope::SYSTEM, + SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, + SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, + bool IsCrossAddressSpaceOrdering = true, + AtomicOrdering FailureOrdering = + AtomicOrdering::SequentiallyConsistent, + bool IsNonTemporal = false) + : Ordering(Ordering), FailureOrdering(FailureOrdering), + Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), + InstrAddrSpace(InstrAddrSpace), + IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), + IsNonTemporal(IsNonTemporal) { + // There is also no cross address space ordering if the ordering + // address space is the same as the instruction address space and + // only contains a single address space. + if ((OrderingAddrSpace == InstrAddrSpace) && + isPowerOf2_32(uint32_t(InstrAddrSpace))) + IsCrossAddressSpaceOrdering = false; + } public: - /// \returns Synchronization scope ID of the machine instruction used to + /// \returns Atomic synchronization scope of the machine instruction used to /// create this SIMemOpInfo. - SyncScope::ID getSSID() const { - return SSID; + SIAtomicScope getScope() const { + return Scope; } + /// \returns Ordering constraint of the machine instruction used to /// create this SIMemOpInfo. AtomicOrdering getOrdering() const { return Ordering; } + /// \returns Failure ordering constraint of the machine instruction used to /// create this SIMemOpInfo. AtomicOrdering getFailureOrdering() const { return FailureOrdering; } + + /// \returns The address spaces accessed by the machine + /// instruction used to create this SIMemOpInfo. + SIAtomicAddrSpace getInstrAddrSpace() const { + return InstrAddrSpace; + } + + /// \returns The address spaces that must be ordered by the machine + /// instruction used to create this SIMemOpInfo. + SIAtomicAddrSpace getOrderingAddrSpace() const { + return OrderingAddrSpace; + } + + /// \returns True iff memory ordering of operations on + /// different address spaces is required. + bool getIsCrossAddressSpaceOrdering() const { + return IsCrossAddressSpaceOrdering; + } + /// \returns True if memory access of the machine instruction used to /// create this SIMemOpInfo is non-temporal, false otherwise. bool isNonTemporal() const { @@ -95,109 +198,198 @@ public: return Ordering != AtomicOrdering::NotAtomic; } +}; + +class SIMemOpAccess final { +private: + + AMDGPUAS SIAddrSpaceInfo; + AMDGPUMachineModuleInfo *MMI = nullptr; + + /// Reports unsupported message \p Msg for \p MI to LLVM context. + void reportUnsupported(const MachineBasicBlock::iterator &MI, + const char *Msg) const; + + /// Inspects the target synchronization scope \p SSID and determines + /// the SI atomic scope it corresponds to, the address spaces it + /// covers, and whether the memory ordering applies between address + /// spaces. + Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> + toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const; + + /// \returns A bit set of the address spaces corresponding to \p AS. + SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; + + /// \returns Info constructed from \p MI, which has at least machine memory + /// operand.
+ Optional<SIMemOpInfo> constructFromMIWithMMO( + const MachineBasicBlock::iterator &MI) const; + +public: + /// Construct class to support accessing the machine memory operands + /// of instructions in the machine function \p MF. + SIMemOpAccess(MachineFunction &MF); + /// \returns Load info if \p MI is a load operation, "None" otherwise. - static Optional<SIMemOpInfo> getLoadInfo( - const MachineBasicBlock::iterator &MI); + Optional<SIMemOpInfo> getLoadInfo( + const MachineBasicBlock::iterator &MI) const; + /// \returns Store info if \p MI is a store operation, "None" otherwise. - static Optional<SIMemOpInfo> getStoreInfo( - const MachineBasicBlock::iterator &MI); + Optional<SIMemOpInfo> getStoreInfo( + const MachineBasicBlock::iterator &MI) const; + /// \returns Atomic fence info if \p MI is an atomic fence operation, /// "None" otherwise. - static Optional<SIMemOpInfo> getAtomicFenceInfo( - const MachineBasicBlock::iterator &MI); - /// \returns Atomic cmpxchg info if \p MI is an atomic cmpxchg operation, - /// "None" otherwise. - static Optional<SIMemOpInfo> getAtomicCmpxchgInfo( - const MachineBasicBlock::iterator &MI); - /// \returns Atomic rmw info if \p MI is an atomic rmw operation, - /// "None" otherwise. - static Optional<SIMemOpInfo> getAtomicRmwInfo( - const MachineBasicBlock::iterator &MI); + Optional<SIMemOpInfo> getAtomicFenceInfo( + const MachineBasicBlock::iterator &MI) const; - /// \brief Reports unknown synchronization scope used in \p MI to LLVM - /// context. - static void reportUnknownSyncScope( - const MachineBasicBlock::iterator &MI); + /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or + /// rmw operation, "None" otherwise. + Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo( + const MachineBasicBlock::iterator &MI) const; }; -class SIMemoryLegalizer final : public MachineFunctionPass { -private: - /// \brief Machine module info. - const AMDGPUMachineModuleInfo *MMI = nullptr; +class SICacheControl { +protected: - /// \brief Instruction info. + /// Instruction info. const SIInstrInfo *TII = nullptr; - /// \brief Immediate for "vmcnt(0)". - unsigned Vmcnt0Immediate = 0; + IsaInfo::IsaVersion IV; - /// \brief Opcode for cache invalidation instruction (L1). - unsigned Wbinvl1Opcode = 0; + SICacheControl(const GCNSubtarget &ST); - /// \brief List of atomic pseudo instructions. - std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; +public: - /// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns - /// true if \p MI is modified, false otherwise. - template <uint16_t BitName> - bool enableNamedBit(const MachineBasicBlock::iterator &MI) const { - int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName); - if (BitIdx == -1) - return false; + /// Create a cache control for the subtarget \p ST. + static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); + + /// Update \p MI memory load instruction to bypass any caches up to + /// the \p Scope memory scope for address spaces \p + /// AddrSpace. Return true iff the instruction was modified. + virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const = 0; + + /// Update \p MI memory instruction to indicate it is + /// nontemporal. Return true iff the instruction was modified. 
+ virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI) + const = 0; + + /// Inserts any necessary instructions at position \p Pos relative + /// to instruction \p MI to ensure any caches associated with + /// address spaces \p AddrSpace for memory scopes up to memory scope + /// \p Scope are invalidated. Returns true iff any instructions + /// inserted. + virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const = 0; + + /// Inserts any necessary instructions at position \p Pos relative + /// to instruction \p MI to ensure memory instructions of kind \p Op + /// associated with address spaces \p AddrSpace have completed as + /// observed by other memory instructions executing in memory scope + /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory + /// ordering is between address spaces. Returns true iff any + /// instructions inserted. + virtual bool insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const = 0; + + /// Virtual destructor to allow derivations to be deleted. + virtual ~SICacheControl() = default; - MachineOperand &Bit = MI->getOperand(BitIdx); - if (Bit.getImm() != 0) - return false; +}; - Bit.setImm(1); - return true; - } +class SIGfx6CacheControl : public SICacheControl { +protected: - /// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI + /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI /// is modified, false otherwise. bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { return enableNamedBit<AMDGPU::OpName::glc>(MI); } - /// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI + /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI /// is modified, false otherwise. bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { return enableNamedBit<AMDGPU::OpName::slc>(MI); } - /// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI. - /// Always returns true. - bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI, - bool Before = true) const; - /// \brief Inserts "s_waitcnt vmcnt(0)" instruction \p Before or after \p MI. - /// Always returns true. 
- bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI, - bool Before = true) const; +public: + + SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; + + bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; + + bool insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; +}; + +class SIGfx7CacheControl : public SIGfx6CacheControl { +public: - /// \brief Removes all processed atomic pseudo instructions from the current + SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}; + + bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; + +}; + +class SIMemoryLegalizer final : public MachineFunctionPass { +private: + + /// Cache Control. + std::unique_ptr<SICacheControl> CC = nullptr; + + /// List of atomic pseudo instructions. + std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; + + /// Return true iff instruction \p MI is a atomic instruction that + /// returns a result. + bool isAtomicRet(const MachineInstr &MI) const { + return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1; + } + + /// Removes all processed atomic pseudo instructions from the current /// function. Returns true if current function is modified, false otherwise. bool removeAtomicPseudoMIs(); - /// \brief Expands load operation \p MI. Returns true if instructions are + /// Expands load operation \p MI. Returns true if instructions are /// added/deleted or \p MI is modified, false otherwise. bool expandLoad(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); - /// \brief Expands store operation \p MI. Returns true if instructions are + /// Expands store operation \p MI. Returns true if instructions are /// added/deleted or \p MI is modified, false otherwise. bool expandStore(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); - /// \brief Expands atomic fence operation \p MI. Returns true if + /// Expands atomic fence operation \p MI. Returns true if /// instructions are added/deleted or \p MI is modified, false otherwise. bool expandAtomicFence(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); - /// \brief Expands atomic cmpxchg operation \p MI. Returns true if - /// instructions are added/deleted or \p MI is modified, false otherwise. - bool expandAtomicCmpxchg(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI); - /// \brief Expands atomic rmw operation \p MI. Returns true if + /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if /// instructions are added/deleted or \p MI is modified, false otherwise. 
- bool expandAtomicRmw(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI); + bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI); public: static char ID; @@ -218,48 +410,129 @@ public: } // end namespace anonymous -/* static */ -Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO( - const MachineBasicBlock::iterator &MI) { - assert(MI->getNumMemOperands() > 0); +void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, + const char *Msg) const { + const Function &Func = MI->getParent()->getParent()->getFunction(); + DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); + Func.getContext().diagnose(Diag); +} - const MachineFunction *MF = MI->getParent()->getParent(); - const AMDGPUMachineModuleInfo *MMI = - &MF->getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); +Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> +SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, + SIAtomicAddrSpace InstrScope) const { + /// TODO: For now assume OpenCL memory model which treats each + /// address space as having a separate happens-before relation, and + /// so an instruction only has ordering with respect to the address + /// space it accesses, and if it accesses multiple address spaces it + /// does not require ordering of operations in different address + /// spaces. + if (SSID == SyncScope::System) + return std::make_tuple(SIAtomicScope::SYSTEM, + SIAtomicAddrSpace::ATOMIC & InstrScope, + false); + if (SSID == MMI->getAgentSSID()) + return std::make_tuple(SIAtomicScope::AGENT, + SIAtomicAddrSpace::ATOMIC & InstrScope, + false); + if (SSID == MMI->getWorkgroupSSID()) + return std::make_tuple(SIAtomicScope::WORKGROUP, + SIAtomicAddrSpace::ATOMIC & InstrScope, + false); + if (SSID == MMI->getWavefrontSSID()) + return std::make_tuple(SIAtomicScope::WAVEFRONT, + SIAtomicAddrSpace::ATOMIC & InstrScope, + false); + if (SSID == SyncScope::SingleThread) + return std::make_tuple(SIAtomicScope::SINGLETHREAD, + SIAtomicAddrSpace::ATOMIC & InstrScope, + false); + /// TODO: To support HSA Memory Model need to add additional memory + /// scopes that specify that do require cross address space + /// ordering. + return None; +} + +SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { + if (AS == SIAddrSpaceInfo.FLAT_ADDRESS) + return SIAtomicAddrSpace::FLAT; + if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS) + return SIAtomicAddrSpace::GLOBAL; + if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS) + return SIAtomicAddrSpace::LDS; + if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS) + return SIAtomicAddrSpace::SCRATCH; + if (AS == SIAddrSpaceInfo.REGION_ADDRESS) + return SIAtomicAddrSpace::GDS; + + return SIAtomicAddrSpace::OTHER; +} + +SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { + SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget()); + MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); +} + +Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( + const MachineBasicBlock::iterator &MI) const { + assert(MI->getNumMemOperands() > 0); SyncScope::ID SSID = SyncScope::SingleThread; AtomicOrdering Ordering = AtomicOrdering::NotAtomic; AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; + SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; bool IsNonTemporal = true; // Validator should check whether or not MMOs cover the entire set of // locations accessed by the memory instruction. 
for (const auto &MMO : MI->memoperands()) { - const auto &IsSyncScopeInclusion = - MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); - if (!IsSyncScopeInclusion) { - reportUnknownSyncScope(MI); - return None; - } - - SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); - Ordering = - isStrongerThan(Ordering, MMO->getOrdering()) ? - Ordering : MMO->getOrdering(); - FailureOrdering = - isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ? - FailureOrdering : MMO->getFailureOrdering(); + IsNonTemporal &= MMO->isNonTemporal(); + InstrAddrSpace |= + toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); + AtomicOrdering OpOrdering = MMO->getOrdering(); + if (OpOrdering != AtomicOrdering::NotAtomic) { + const auto &IsSyncScopeInclusion = + MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); + if (!IsSyncScopeInclusion) { + reportUnsupported(MI, + "Unsupported non-inclusive atomic synchronization scope"); + return None; + } - if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal)) - IsNonTemporal = false; + SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); + Ordering = + isStrongerThan(Ordering, OpOrdering) ? + Ordering : MMO->getOrdering(); + assert(MMO->getFailureOrdering() != AtomicOrdering::Release && + MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); + FailureOrdering = + isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ? + FailureOrdering : MMO->getFailureOrdering(); + } } - return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal); + SIAtomicScope Scope = SIAtomicScope::NONE; + SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; + bool IsCrossAddressSpaceOrdering = false; + if (Ordering != AtomicOrdering::NotAtomic) { + auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); + if (!ScopeOrNone) { + reportUnsupported(MI, "Unsupported atomic synchronization scope"); + return None; + } + std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = + ScopeOrNone.getValue(); + if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || + ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { + reportUnsupported(MI, "Unsupported atomic address space"); + return None; + } + } + return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, + IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal); } -/* static */ -Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo( - const MachineBasicBlock::iterator &MI) { +Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo( + const MachineBasicBlock::iterator &MI) const { assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); if (!(MI->mayLoad() && !MI->mayStore())) @@ -267,15 +540,13 @@ Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo( // Be conservative if there are no memory operands. if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(SyncScope::System, - AtomicOrdering::SequentiallyConsistent); + return SIMemOpInfo(); - return SIMemOpInfo::constructFromMIWithMMO(MI); + return constructFromMIWithMMO(MI); } -/* static */ -Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo( - const MachineBasicBlock::iterator &MI) { +Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo( + const MachineBasicBlock::iterator &MI) const { assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); if (!(!MI->mayLoad() && MI->mayStore())) @@ -283,30 +554,46 @@ Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo( // Be conservative if there are no memory operands. 
if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(SyncScope::System, - AtomicOrdering::SequentiallyConsistent); + return SIMemOpInfo(); - return SIMemOpInfo::constructFromMIWithMMO(MI); + return constructFromMIWithMMO(MI); } -/* static */ -Optional<SIMemOpInfo> SIMemOpInfo::getAtomicFenceInfo( - const MachineBasicBlock::iterator &MI) { +Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo( + const MachineBasicBlock::iterator &MI) const { assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) return None; - SyncScope::ID SSID = - static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); AtomicOrdering Ordering = - static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); - return SIMemOpInfo(SSID, Ordering); + static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); + + SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); + auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); + if (!ScopeOrNone) { + reportUnsupported(MI, "Unsupported atomic synchronization scope"); + return None; + } + + SIAtomicScope Scope = SIAtomicScope::NONE; + SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; + bool IsCrossAddressSpaceOrdering = false; + std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = + ScopeOrNone.getValue(); + + if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || + ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { + reportUnsupported(MI, "Unsupported atomic address space"); + return None; + } + + return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, + IsCrossAddressSpaceOrdering); } -/* static */ -Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo( - const MachineBasicBlock::iterator &MI) { +Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( + const MachineBasicBlock::iterator &MI) const { assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); if (!(MI->mayLoad() && MI->mayStore())) @@ -314,68 +601,251 @@ Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo( // Be conservative if there are no memory operands. if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(SyncScope::System, - AtomicOrdering::SequentiallyConsistent, - AtomicOrdering::SequentiallyConsistent); + return SIMemOpInfo(); - return SIMemOpInfo::constructFromMIWithMMO(MI); + return constructFromMIWithMMO(MI); +} + +SICacheControl::SICacheControl(const GCNSubtarget &ST) { + TII = ST.getInstrInfo(); + IV = IsaInfo::getIsaVersion(ST.getFeatureBits()); } /* static */ -Optional<SIMemOpInfo> SIMemOpInfo::getAtomicRmwInfo( - const MachineBasicBlock::iterator &MI) { - assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); +std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { + GCNSubtarget::Generation Generation = ST.getGeneration(); + if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return make_unique<SIGfx6CacheControl>(ST); + return make_unique<SIGfx7CacheControl>(ST); +} - if (!(MI->mayLoad() && MI->mayStore())) - return None; +bool SIGfx6CacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; - // Be conservative if there are no memory operands. 
- if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(SyncScope::System, - AtomicOrdering::SequentiallyConsistent); + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + /// TODO: Do not set glc for rmw atomic operations as they + /// implicitly bypass the L1 cache. - return SIMemOpInfo::constructFromMIWithMMO(MI); + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not hava a cache. + + return Changed; } -/* static */ -void SIMemOpInfo::reportUnknownSyncScope( - const MachineBasicBlock::iterator &MI) { - DiagnosticInfoUnsupported Diag(MI->getParent()->getParent()->getFunction(), - "Unsupported synchronization scope"); - LLVMContext *CTX = &MI->getParent()->getParent()->getFunction().getContext(); - CTX->diagnose(Diag); +bool SIGfx6CacheControl::enableNonTemporal( + const MachineBasicBlock::iterator &MI) const { + assert(MI->mayLoad() ^ MI->mayStore()); + bool Changed = false; + + /// TODO: Do not enableGLCBit if rmw atomic. + Changed |= enableGLCBit(MI); + Changed |= enableSLCBit(MI); + + return Changed; } -bool SIMemoryLegalizer::insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI, - bool Before) const { +bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + bool Changed = false; + MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - if (!Before) + if (Pos == Position::AFTER) ++MI; - BuildMI(MBB, MI, DL, TII->get(Wbinvl1Opcode)); + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); + Changed = true; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to invalidate. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not hava a cache. 
- if (!Before) + if (Pos == Position::AFTER) --MI; - return true; + return Changed; } -bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI, - bool Before) const { +bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - if (!Before) + if (Pos == Position::AFTER) ++MI; - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Vmcnt0Immediate); + bool VMCnt = false; + bool LGKMCnt = false; + bool EXPCnt = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + VMCnt = true; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The L1 cache keeps all memory operations in order for + // wavefronts in the same work-group. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + // If no cross address space ordering then an LDS waitcnt is not + // needed as LDS operations for all waves are executed in a + // total global ordering as observed by all waves. Required if + // also synchronizing with global/GDS memory as LDS operations + // could be reordered with respect to later global/GDS memory + // operations of the same wave. + LGKMCnt = IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The LDS keeps all memory operations in order for + // the same wavesfront. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + // If no cross address space ordering then an GDS waitcnt is not + // needed as GDS operations for all waves are executed in a + // total global ordering as observed by all waves. Required if + // also synchronizing with global/LDS memory as GDS operations + // could be reordered with respect to later global/LDS memory + // operations of the same wave. + EXPCnt = IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The GDS keeps all memory operations in order for + // the same work-group. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } - if (!Before) + if (VMCnt || LGKMCnt || EXPCnt) { + unsigned WaitCntImmediate = + AMDGPU::encodeWaitcnt(IV, + VMCnt ? 0 : getVmcntBitMask(IV), + EXPCnt ? 0 : getExpcntBitMask(IV), + LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); + Changed = true; + } + + if (Pos == Position::AFTER) --MI; - return true; + return Changed; +} + +bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL)); + Changed = true; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to invalidate. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not hava a cache. + + if (Pos == Position::AFTER) + --MI; + + return Changed; } bool SIMemoryLegalizer::removeAtomicPseudoMIs() { @@ -396,37 +866,38 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, bool Changed = false; if (MOI.isAtomic()) { - if (MOI.getSSID() == SyncScope::System || - MOI.getSSID() == MMI->getAgentSSID()) { - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= enableGLCBit(MI); - - if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertWaitcntVmcnt0(MI); - - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= insertWaitcntVmcnt0(MI, false); - Changed |= insertBufferWbinvl1Vol(MI, false); - } - - return Changed; + if (MOI.getOrdering() == AtomicOrdering::Monotonic || + MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), + MOI.getOrderingAddrSpace()); } - if (MOI.getSSID() == SyncScope::SingleThread || - MOI.getSSID() == MMI->getWorkgroupSSID() || - MOI.getSSID() == MMI->getWavefrontSSID()) { - return Changed; + if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); + + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getInstrAddrSpace(), + SIMemOp::LOAD, + MOI.getIsCrossAddressSpaceOrdering(), + Position::AFTER); + Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + Position::AFTER); } - llvm_unreachable("Unsupported synchronization scope"); + return Changed; } // Atomic instructions do not have the nontemporal attribute. 
if (MOI.isNonTemporal()) { - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + Changed |= CC->enableNonTemporal(MI); return Changed; } @@ -440,28 +911,20 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, bool Changed = false; if (MOI.isAtomic()) { - if (MOI.getSSID() == SyncScope::System || - MOI.getSSID() == MMI->getAgentSSID()) { - if (MOI.getOrdering() == AtomicOrdering::Release || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertWaitcntVmcnt0(MI); + if (MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); - return Changed; - } - - if (MOI.getSSID() == SyncScope::SingleThread || - MOI.getSSID() == MMI->getWorkgroupSSID() || - MOI.getSSID() == MMI->getWavefrontSSID()) { - return Changed; - } - - llvm_unreachable("Unsupported synchronization scope"); + return Changed; } // Atomic instructions do not have the nontemporal attribute. if (MOI.isNonTemporal()) { - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + Changed |= CC->enableNonTemporal(MI); return Changed; } @@ -472,111 +935,74 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) { assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); + AtomicPseudoMIs.push_back(MI); bool Changed = false; if (MOI.isAtomic()) { - if (MOI.getSSID() == SyncScope::System || - MOI.getSSID() == MMI->getAgentSSID()) { - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::Release || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertWaitcntVmcnt0(MI); - - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertBufferWbinvl1Vol(MI); - - AtomicPseudoMIs.push_back(MI); - return Changed; - } + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + /// TODO: This relies on a barrier always generating a waitcnt + /// for LDS to ensure it is not reordered with the completion of + /// the proceeding LDS operations. If barrier had a memory + /// ordering and memory scope, then library does not need to + /// generate a fence. Could add support in this file for + /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally + /// adding waitcnt before a S_BARRIER. 
+ Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); + + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + Position::BEFORE); - if (MOI.getSSID() == SyncScope::SingleThread || - MOI.getSSID() == MMI->getWorkgroupSSID() || - MOI.getSSID() == MMI->getWavefrontSSID()) { - AtomicPseudoMIs.push_back(MI); - return Changed; - } - - SIMemOpInfo::reportUnknownSyncScope(MI); - } - - return Changed; -} - -bool SIMemoryLegalizer::expandAtomicCmpxchg(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) { - assert(MI->mayLoad() && MI->mayStore()); - - bool Changed = false; - - if (MOI.isAtomic()) { - if (MOI.getSSID() == SyncScope::System || - MOI.getSSID() == MMI->getAgentSSID()) { - if (MOI.getOrdering() == AtomicOrdering::Release || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || - MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertWaitcntVmcnt0(MI); - - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || - MOI.getFailureOrdering() == AtomicOrdering::Acquire || - MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= insertWaitcntVmcnt0(MI, false); - Changed |= insertBufferWbinvl1Vol(MI, false); - } - - return Changed; - } - - if (MOI.getSSID() == SyncScope::SingleThread || - MOI.getSSID() == MMI->getWorkgroupSSID() || - MOI.getSSID() == MMI->getWavefrontSSID()) { - Changed |= enableGLCBit(MI); - return Changed; - } - - llvm_unreachable("Unsupported synchronization scope"); + return Changed; } return Changed; } -bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) { +bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; if (MOI.isAtomic()) { - if (MOI.getSSID() == SyncScope::System || - MOI.getSSID() == MMI->getAgentSSID()) { - if (MOI.getOrdering() == AtomicOrdering::Release || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertWaitcntVmcnt0(MI); - - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= insertWaitcntVmcnt0(MI, false); - Changed |= insertBufferWbinvl1Vol(MI, false); - } - - return Changed; + if (MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || + MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); + + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || + MOI.getFailureOrdering() == 
AtomicOrdering::Acquire || + MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + isAtomicRet(*MI) ? SIMemOp::LOAD : + SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::AFTER); + Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + Position::AFTER); } - if (MOI.getSSID() == SyncScope::SingleThread || - MOI.getSSID() == MMI->getWorkgroupSSID() || - MOI.getSSID() == MMI->getWavefrontSSID()) { - Changed |= enableGLCBit(MI); - return Changed; - } - - llvm_unreachable("Unsupported synchronization scope"); + return Changed; } return Changed; @@ -584,32 +1010,23 @@ bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI, bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const IsaInfo::IsaVersion IV = IsaInfo::getIsaVersion(ST.getFeatureBits()); - - MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); - TII = ST.getInstrInfo(); - Vmcnt0Immediate = - AMDGPU::encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV)); - Wbinvl1Opcode = ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS ? - AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL; + SIMemOpAccess MOA(MF); + CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; - if (const auto &MOI = SIMemOpInfo::getLoadInfo(MI)) + if (const auto &MOI = MOA.getLoadInfo(MI)) Changed |= expandLoad(MOI.getValue(), MI); - else if (const auto &MOI = SIMemOpInfo::getStoreInfo(MI)) + else if (const auto &MOI = MOA.getStoreInfo(MI)) Changed |= expandStore(MOI.getValue(), MI); - else if (const auto &MOI = SIMemOpInfo::getAtomicFenceInfo(MI)) + else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) Changed |= expandAtomicFence(MOI.getValue(), MI); - else if (const auto &MOI = SIMemOpInfo::getAtomicCmpxchgInfo(MI)) - Changed |= expandAtomicCmpxchg(MOI.getValue(), MI); - else if (const auto &MOI = SIMemOpInfo::getAtomicRmwInfo(MI)) - Changed |= expandAtomicRmw(MOI.getValue(), MI); + else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) + Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI); } } diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 2dc6f2702b3b..ebcad30a1866 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -10,6 +10,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -76,7 +77,7 @@ static unsigned isCopyToExec(const MachineInstr &MI) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: { const MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC) + if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); break; } @@ -208,7 +209,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const 
SIInstrInfo *TII = ST.getInstrInfo(); @@ -243,11 +244,11 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec if (CopyToExecInst->getOperand(1).isKill() && isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) { - DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst); + LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst); PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC); - DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); + LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); CopyToExecInst->eraseFromParent(); } @@ -257,7 +258,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (isLiveOut(MBB, CopyToExec)) { // The copied register is live out and has a second use in another block. - DEBUG(dbgs() << "Exec copy source register is live out\n"); + LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n"); continue; } @@ -269,7 +270,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator(); J != JE; ++J) { if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) { - DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); + LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); // Make sure this is inserted after any VALU ops that may have been // scheduled in between. SaveExecInst = nullptr; @@ -280,8 +281,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (J->modifiesRegister(CopyToExec, TRI)) { if (SaveExecInst) { - DEBUG(dbgs() << "Multiple instructions modify " - << printReg(CopyToExec, TRI) << '\n'); + LLVM_DEBUG(dbgs() << "Multiple instructions modify " + << printReg(CopyToExec, TRI) << '\n'); SaveExecInst = nullptr; break; } @@ -292,10 +293,11 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (ReadsCopyFromExec) { SaveExecInst = &*J; - DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n'); + LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n'); continue; } else { - DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n'); + LLVM_DEBUG(dbgs() + << "Instruction does not read exec copy: " << *J << '\n'); break; } } else if (ReadsCopyFromExec && !SaveExecInst) { @@ -307,8 +309,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { // spill %sgpr0_sgpr1 // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1 // - DEBUG(dbgs() << "Found second use of save inst candidate: " - << *J << '\n'); + LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J + << '\n'); break; } @@ -321,7 +323,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (!SaveExecInst) continue; - DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n'); + LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n'); MachineOperand &Src0 = SaveExecInst->getOperand(1); MachineOperand &Src1 = SaveExecInst->getOperand(2); diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 83074773c495..7b678d12ba81 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This pass removes redundant S_OR_B64 instructions enabling lanes in +/// This pass removes redundant S_OR_B64 
instructions enabling lanes in /// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any /// vector instructions between them we can only keep outer SI_END_CF, given /// that CFG is structured and exec bits of the outer end statement are always @@ -23,6 +23,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -106,7 +107,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -134,7 +135,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { } while (I != E) { - if (I->isDebugValue()) { + if (I->isDebugInstr()) { I = std::next(I); continue; } @@ -143,7 +144,8 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef()) break; - DEBUG(dbgs() << "Removing no effect instruction: " << *I << '\n'); + LLVM_DEBUG(dbgs() + << "Removing no effect instruction: " << *I << '\n'); for (auto &Op : I->operands()) { if (Op.isReg()) @@ -193,7 +195,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { !getOrExecSource(*NextLead, *TII, MRI)) continue; - DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n'); + LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n'); auto SaveExec = getOrExecSource(*Lead, *TII, MRI); unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII); @@ -224,7 +226,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { break; } - DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n'); + LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n'); } if (SafeToReplace) { diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 5ed7fdf220bf..0e000b72962e 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -25,6 +25,7 @@ #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" @@ -39,6 +40,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Config/llvm-config.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Pass.h" @@ -86,11 +88,11 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override; - void matchSDWAOperands(MachineFunction &MF); + void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); - bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const; + bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); - void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const; + void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; StringRef getPassName() const override { return "SI Peephole SDWA"; 
} @@ -218,7 +220,7 @@ FunctionPass *llvm::createSIPeepholeSDWAPass() { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) { +static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { switch(Sel) { case BYTE_0: OS << "BYTE_0"; break; case BYTE_1: OS << "BYTE_1"; break; @@ -366,18 +368,53 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { // Find operand in instruction that matches source operand and replace it with // target operand. Set corresponding src_sel - + bool IsPreserveSrc = false; MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); MachineOperand *SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); assert(Src && (Src->isReg() || Src->isImm())); if (!isSameReg(*Src, *getReplacedOperand())) { - // If this is not src0 then it should be src1 + // If this is not src0 then it could be src1 Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + if (!Src || + !isSameReg(*Src, *getReplacedOperand())) { + // It's possible this Src is a tied operand for + // UNUSED_PRESERVE, in which case we can either + // abandon the peephole attempt, or if legal we can + // copy the target operand into the tied slot + // if the preserve operation will effectively cause the same + // result by overwriting the rest of the dst. + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + MachineOperand *DstUnused = + TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); + + if (Dst && + DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { + // This will work if the tied src is acessing WORD_0, and the dst is + // writing WORD_1. Modifiers don't matter because all the bits that + // would be impacted are being overwritten by the dst. + // Any other case will not work. 
+ SdwaSel DstSel = static_cast<SdwaSel>( + TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); + if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && + getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { + IsPreserveSrc = true; + auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vdst); + auto TiedIdx = MI.findTiedOperandIdx(DstIdx); + Src = &MI.getOperand(TiedIdx); + SrcSel = nullptr; + SrcMods = nullptr; + } else { + // Not legal to convert this src + return false; + } + } + } assert(Src && Src->isReg()); if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || @@ -388,11 +425,14 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { return false; } - assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods); + assert(isSameReg(*Src, *getReplacedOperand()) && + (IsPreserveSrc || (SrcSel && SrcMods))); } copyRegOperand(*Src, *getTargetOperand()); - SrcSel->setImm(getSrcSel()); - SrcMods->setImm(getSrcMods(TII, Src)); + if (!IsPreserveSrc) { + SrcSel->setImm(getSrcSel()); + SrcMods->setImm(getSrcMods(TII, Src)); + } getTargetOperand()->setIsKill(false); return true; } @@ -661,7 +701,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src1->getReg()) || + if (TRI->isPhysicalRegister(ValSrc->getReg()) || TRI->isPhysicalRegister(Dst->getReg())) break; @@ -739,8 +779,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { // TODO: add support for non-SDWA instructions as OtherInst. // For now this only works with SDWA instructions. For regular instructions - // there is no way to determine if instruction write only 8/16/24-bit out of - // full register size and all registers are at min 32-bit wide. + // there is no way to determine if the instruction writes only 8/16/24-bit + // out of full register size and all registers are at min 32-bit wide. 
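A side note on the WORD_0 / WORD_1 preserve case above: the hunk's own comment argues legality purely from non-overlapping bit ranges, and that argument is easy to sanity-check with ordinary integers. The short standalone C++ sketch below is illustrative only (it is not code from this pass); it models a 32-bit register in which the destination writes the high word while the preserved bits come from the tied value, so the low word that the converted source selects is left untouched.

// Illustrative model of the dst_sel == WORD_1 / src_sel == WORD_0 preserve
// case discussed above; not the SIPeepholeSDWA implementation itself.
#include <cassert>
#include <cstdint>

// src_sel == WORD_0: the source operand reads bits 15:0 of the register.
static uint32_t selectWord0(uint32_t Reg) { return Reg & 0xffffu; }

// dst_sel == WORD_1 with UNUSED_PRESERVE: the 16-bit result lands in bits
// 31:16 and the remaining bits are taken from the old (tied) value.
static uint32_t writeWord1Preserve(uint32_t Tied, uint32_t Result16) {
  return (Tied & 0x0000ffffu) | ((Result16 & 0xffffu) << 16);
}

int main() {
  uint32_t Tied = 0xAAAA1234u;              // old destination / tied operand
  uint32_t Src = selectWord0(Tied);         // 0x1234
  uint32_t Result = (Src + 1) & 0xffffu;    // some 16-bit computation
  uint32_t Dst = writeWord1Preserve(Tied, Result);

  // The bits the WORD_0 source selects are exactly the bits the preserve
  // keeps, so the high-word write cannot disturb them.
  assert(selectWord0(Dst) == selectWord0(Tied));
  assert(Dst == 0x12351234u);
  return 0;
}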
if (!TII->isSDWA(*OtherInst)) break; @@ -804,20 +844,18 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { return std::unique_ptr<SDWAOperand>(nullptr); } -void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (auto Operand = matchSDWAOperand(MI)) { - DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); - SDWAOperands[&MI] = std::move(Operand); - ++NumSDWAPatternsFound; - } +void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { + for (MachineInstr &MI : MBB) { + if (auto Operand = matchSDWAOperand(MI)) { + LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); + SDWAOperands[&MI] = std::move(Operand); + ++NumSDWAPatternsFound; } } } bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, - const SISubtarget &ST) const { + const GCNSubtarget &ST) const { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); if (TII->isSDWA(Opc)) @@ -854,11 +892,18 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, Opc == AMDGPU::V_MAC_F32_e32)) return false; + // FIXME: has SDWA but require handling of implicit VCC use + if (Opc == AMDGPU::V_CNDMASK_B32_e32) + return false; + return true; } bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands) { + + LLVM_DEBUG(dbgs() << "Convert instruction:" << MI); + // Convert to sdwa int SDWAOpcode; unsigned Opcode = MI.getOpcode(); @@ -984,9 +1029,29 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, } } - // Apply all sdwa operand pattenrs + // Check for a preserved register that needs to be copied. + auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); + if (DstUnused && + DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { + // We expect, if we are here, that the instruction was already in it's SDWA form, + // with a tied operand. + assert(Dst && Dst->isTied()); + assert(Opcode == static_cast<unsigned int>(SDWAOpcode)); + // We also expect a vdst, since sdst can't preserve. + auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst); + assert(PreserveDstIdx != -1); + + auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx); + auto Tied = MI.getOperand(TiedIdx); + + SDWAInst.add(Tied); + SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1); + } + + // Apply all sdwa operand patterns. bool Converted = false; for (auto &Operand : SDWAOperands) { + LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); // There should be no intesection between SDWA operands and potential MIs // e.g.: // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 @@ -1007,8 +1072,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, return false; } - DEBUG(dbgs() << "Convert instruction:" << MI - << "Into:" << *SDWAInst << '\n'); + LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n'); ++NumSDWAInstructionsPeepholed; MI.eraseFromParent(); @@ -1017,7 +1081,8 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, // If an instruction was converted to SDWA it should not have immediates or SGPR // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. 
-void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const { +void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, + const GCNSubtarget &ST) const { const MCInstrDesc &Desc = TII->get(MI.getOpcode()); unsigned ConstantBusCount = 0; for (MachineOperand &Op : MI.explicit_uses()) { @@ -1048,7 +1113,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget } bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (!ST.hasSDWA() || skipFunction(MF.getFunction())) return false; @@ -1058,35 +1123,36 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); // Find all SDWA operands in MF. - bool Changed = false; bool Ret = false; - do { - matchSDWAOperands(MF); - - for (const auto &OperandPair : SDWAOperands) { - const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { - PotentialMatches[PotentialMI].push_back(Operand.get()); + for (MachineBasicBlock &MBB : MF) { + bool Changed = false; + do { + matchSDWAOperands(MBB); + + for (const auto &OperandPair : SDWAOperands) { + const auto &Operand = OperandPair.second; + MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { + PotentialMatches[PotentialMI].push_back(Operand.get()); + } } - } - for (auto &PotentialPair : PotentialMatches) { - MachineInstr &PotentialMI = *PotentialPair.first; - convertToSDWA(PotentialMI, PotentialPair.second); - } - - PotentialMatches.clear(); - SDWAOperands.clear(); + for (auto &PotentialPair : PotentialMatches) { + MachineInstr &PotentialMI = *PotentialPair.first; + convertToSDWA(PotentialMI, PotentialPair.second); + } - Changed = !ConvertedInstructions.empty(); + PotentialMatches.clear(); + SDWAOperands.clear(); - if (Changed) - Ret = true; + Changed = !ConvertedInstructions.empty(); - while (!ConvertedInstructions.empty()) - legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); - } while (Changed); + if (Changed) + Ret = true; + while (!ConvertedInstructions.empty()) + legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); + } while (Changed); + } return Ret; } diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h new file mode 100644 index 000000000000..383f6b575808 --- /dev/null +++ b/lib/Target/AMDGPU/SIProgramInfo.h @@ -0,0 +1,77 @@ +//===--- SIProgramInfo.h ----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Defines struct to track resource usage for kernels and entry functions. +/// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H +#define LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H + +namespace llvm { + +/// Track resource usage for kernels / entry functions. +struct SIProgramInfo { + // Fields set in PGM_RSRC1 pm4 packet. 
+ uint32_t VGPRBlocks = 0; + uint32_t SGPRBlocks = 0; + uint32_t Priority = 0; + uint32_t FloatMode = 0; + uint32_t Priv = 0; + uint32_t DX10Clamp = 0; + uint32_t DebugMode = 0; + uint32_t IEEEMode = 0; + uint64_t ScratchSize = 0; + + uint64_t ComputePGMRSrc1 = 0; + + // Fields set in PGM_RSRC2 pm4 packet. + uint32_t LDSBlocks = 0; + uint32_t ScratchBlocks = 0; + + uint64_t ComputePGMRSrc2 = 0; + + uint32_t NumVGPR = 0; + uint32_t NumSGPR = 0; + uint32_t LDSSize = 0; + bool FlatUsed = false; + + // Number of SGPRs that meets number of waves per execution unit request. + uint32_t NumSGPRsForWavesPerEU = 0; + + // Number of VGPRs that meets number of waves per execution unit request. + uint32_t NumVGPRsForWavesPerEU = 0; + + // Fixed SGPR number used to hold wave scratch offset for entire kernel + // execution, or std::numeric_limits<uint16_t>::max() if the register is not + // used or not known. + uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR = + std::numeric_limits<uint16_t>::max(); + + // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire + // kernel execution, or std::numeric_limits<uint16_t>::max() if the register + // is not used or not known. + uint16_t DebuggerPrivateSegmentBufferSGPR = + std::numeric_limits<uint16_t>::max(); + + // Whether there is recursion, dynamic allocas, indirect calls or some other + // reason there may be statically unknown stack usage. + bool DynamicCallStack = false; + + // Bonus information for debugging. + bool VCCUsed = false; + + SIProgramInfo() = default; +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 65cdc13e03cd..624607f6ea54 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -8,14 +8,16 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief SI implementation of the TargetRegisterInfo class. +/// SI implementation of the TargetRegisterInfo class. 
// //===----------------------------------------------------------------------===// #include "SIRegisterInfo.h" +#include "AMDGPURegisterBankInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" @@ -54,7 +56,7 @@ static cl::opt<bool> EnableSpillSGPRToVGPR( cl::ReallyHidden, cl::init(true)); -SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) : +SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPURegisterInfo(), SGPRPressureSets(getNumRegPressureSets()), VGPRPressureSets(getNumRegPressureSets()), @@ -101,17 +103,10 @@ SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) : VGPRSetID < NumRegPressureSets); } -void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { - MCRegAliasIterator R(Reg, this, true); - - for (; R.isValid(); ++R) - Reserved.set(*R); -} - unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); @@ -136,7 +131,7 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); return AMDGPU::SGPR_32RegClass.getRegister(Reg); } @@ -163,6 +158,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); + // Reserve xnack_mask registers - support is not implemented in Codegen. + reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); + // Reserve Trap Handler registers - support is not implemented in Codegen. reserveRegisterTuples(Reserved, AMDGPU::TBA); reserveRegisterTuples(Reserved, AMDGPU::TMA); @@ -175,7 +173,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); @@ -255,7 +253,7 @@ bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( // create a virtual register for it during frame index elimination, so the // scavenger is directly needed. 
return MF.getFrameInfo().hasStackObjects() && - MF.getSubtarget<SISubtarget>().hasScalarStores() && + MF.getSubtarget<GCNSubtarget>().hasScalarStores() && MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs(); } @@ -310,7 +308,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, DL = Ins->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = Subtarget.getInstrInfo(); if (Offset == 0) { @@ -339,7 +337,7 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); - const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = Subtarget.getInstrInfo(); #ifndef NDEBUG @@ -526,7 +524,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, RegScavenger *RS) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); @@ -534,22 +532,29 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, const DebugLoc &DL = MI->getDebugLoc(); bool IsStore = Desc.mayStore(); - bool RanOutOfSGPRs = false; bool Scavenged = false; unsigned SOffset = ScratchOffsetReg; + const unsigned EltSize = 4; const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); - unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32; - unsigned Size = NumSubRegs * 4; + unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT); + unsigned Size = NumSubRegs * EltSize; int64_t Offset = InstOffset + MFI.getObjectOffset(Index); - const int64_t OriginalImmOffset = Offset; + int64_t ScratchOffsetRegDelta = 0; unsigned Align = MFI.getObjectAlignment(Index); const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); - if (!isUInt<12>(Offset + Size)) { + assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); + + if (!isUInt<12>(Offset + Size - EltSize)) { SOffset = AMDGPU::NoRegister; + // We currently only support spilling VGPRs to EltSize boundaries, meaning + // we can simplify the adjustment of Offset here to just scale with + // WavefrontSize. + Offset *= ST.getWavefrontSize(); + // We don't have access to the register scavenger if this function is called // during PEI::scavengeFrameVirtualRegs(). if (RS) @@ -563,8 +568,8 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, // add the offset directly to the ScratchOffset register, and then // subtract the offset after the spill to return ScratchOffset to it's // original value. - RanOutOfSGPRs = true; SOffset = ScratchOffsetReg; + ScratchOffsetRegDelta = Offset; } else { Scavenged = true; } @@ -576,8 +581,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, Offset = 0; } - const unsigned EltSize = 4; - for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { unsigned SubReg = NumSubRegs == 1 ? 
ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i)); @@ -609,11 +612,11 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); } - if (RanOutOfSGPRs) { + if (ScratchOffsetRegDelta != 0) { // Subtract the offset we added to the ScratchOffset register. BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg) - .addReg(ScratchOffsetReg) - .addImm(OriginalImmOffset); + .addReg(ScratchOffsetReg) + .addImm(ScratchOffsetRegDelta); } } @@ -640,6 +643,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MBB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + DenseSet<unsigned> SGPRSpillVGPRDefinedSet; ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = MFI->getSGPRToVGPRSpills(Index); @@ -648,7 +652,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, return false; MachineRegisterInfo &MRI = MF->getRegInfo(); - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); unsigned SuperReg = MI->getOperand(0).getReg(); @@ -661,6 +665,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && OnlyToVGPR) return false; + assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && + SuperReg != MFI->getFrameOffsetReg() && + SuperReg != MFI->getScratchWaveOffsetReg())); + assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); unsigned OffsetReg = AMDGPU::M0; @@ -736,11 +744,21 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; + // During SGPR spilling to VGPR, determine if the VGPR is defined. The + // only circumstance in which we say it is undefined is when it is the + // first spill to this VGPR in the first basic block. + bool VGPRDefined = true; + if (MBB == &MF->front()) + VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second; + + // Mark the "old value of vgpr" input undef only if this is the first sgpr + // spill to this specific vgpr in the first basic block. BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR) .addReg(SubReg, getKillRegState(IsKill)) - .addImm(Spill.Lane); + .addImm(Spill.Lane) + .addReg(Spill.VGPR, VGPRDefined ? 
0 : RegState::Undef); // FIXME: Since this spills to another register instead of an actual // frame index, we should delete the frame index when all references to @@ -812,7 +830,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, return false; MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = MI->getDebugLoc(); @@ -972,7 +990,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -1051,8 +1069,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // Convert to an absolute stack address by finding the offset from the // scratch wave base and scaling by the wave size. // - // In an entry function/kernel the stack address is already the absolute - // address relative to the the scratch wave offset. + // In an entry function/kernel the stack address is already the + // absolute address relative to the scratch wave offset. unsigned DiffReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -1219,6 +1237,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::VReg_512RegClass, &AMDGPU::SReg_512RegClass, &AMDGPU::SCC_CLASSRegClass, + &AMDGPU::Pseudo_SReg_32RegClass, + &AMDGPU::Pseudo_SReg_128RegClass, }; for (const TargetRegisterClass *BaseClass : BaseClasses) { @@ -1355,7 +1375,7 @@ bool SIRegisterInfo::shouldRewriteCopySrc( return getCommonSubClass(DefRC, SrcRC) != nullptr; } -/// \brief Returns a register that is not used at any point in the function. +/// Returns a register that is not used at any point in the function. /// If all registers are used, then this function will return // AMDGPU::NoRegister. unsigned @@ -1483,7 +1503,9 @@ SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const { - return hasVGPRs(getRegClassForReg(MRI, Reg)); + const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); + assert(RC && "Register class for the reg not found"); + return hasVGPRs(RC); } bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, @@ -1510,7 +1532,7 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), @@ -1545,3 +1567,34 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { return Empty; return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit); } + +unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { + // Not a callee saved register. 
+ return AMDGPU::SGPR30_SGPR31; +} + +const TargetRegisterClass * +SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, + const MachineRegisterInfo &MRI) const { + unsigned Size = getRegSizeInBits(MO.getReg(), MRI); + const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()); + if (!RB) + return nullptr; + + switch (Size) { + case 32: + return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : + &AMDGPU::SReg_32_XM0RegClass; + case 64: + return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : + &AMDGPU::SReg_64_XEXECRegClass; + case 96: + return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : + nullptr; + case 128: + return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : + &AMDGPU::SReg_128RegClass; + default: + llvm_unreachable("not implemented"); + } +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index bf814b6974a8..5a51b67ca719 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface definition for SIRegisterInfo +/// Interface definition for SIRegisterInfo // //===----------------------------------------------------------------------===// @@ -16,15 +16,14 @@ #define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { +class GCNSubtarget; class LiveIntervals; class MachineRegisterInfo; -class SISubtarget; class SIMachineFunctionInfo; class SIRegisterInfo final : public AMDGPURegisterInfo { @@ -36,11 +35,10 @@ private: bool SpillSGPRToVGPR; bool SpillSGPRToSMEM; - void reserveRegisterTuples(BitVector &, unsigned Reg) const; void classifyPressureSet(unsigned PSetID, unsigned Reg, BitVector &PressureSets) const; public: - SIRegisterInfo(const SISubtarget &ST); + SIRegisterInfo(const GCNSubtarget &ST); bool spillSGPRToVGPR() const { return SpillSGPRToVGPR; @@ -126,7 +124,7 @@ public: return getEncodingValue(Reg) & 0xff; } - /// \brief Return the 'base' register class for this register. + /// Return the 'base' register class for this register. /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; @@ -224,10 +222,11 @@ public: const int *getRegUnitPressureSets(unsigned RegUnit) const override; - unsigned getReturnAddressReg(const MachineFunction &MF) const { - // Not a callee saved register. 
- return AMDGPU::SGPR30_SGPR31; - } + unsigned getReturnAddressReg(const MachineFunction &MF) const; + + const TargetRegisterClass * + getConstrainedRegClassForOperand(const MachineOperand &MO, + const MachineRegisterInfo &MRI) const override; private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index dd0efef7f91b..f87a0763b353 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -76,6 +76,16 @@ def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>; def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>; def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>; +def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>; +def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>; + +def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, + DwarfRegAlias<XNACK_MASK_LO> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 104; +} + // Trap handler registers def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; @@ -394,7 +404,7 @@ def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], let CopyCost = -1; } -def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32, +def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; @@ -403,7 +413,7 @@ def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32, // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, + (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { let AllocationPriority = 7; @@ -425,22 +435,22 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, let AllocationPriority = 7; } -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> { +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 8; } -def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> { +def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; } -def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, - (add SGPR_64, VCC, FLAT_SCR, TTMP_64, TBA, TMA)> { +def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, + (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 8; } -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 8; @@ -457,7 +467,7 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R let isAllocatable = 0; } -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64, v2f64], 32, (add SGPR_128, TTMP_128)> { let AllocationPriority = 10; } @@ -495,7 +505,7 @@ def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, } // 
Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> { let Size = 64; // Requires 2 v_mov_b32 to copy diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td index 0f02f5825cb0..7af69cb6a46d 100644 --- a/lib/Target/AMDGPU/SISchedule.td +++ b/lib/Target/AMDGPU/SISchedule.td @@ -46,7 +46,7 @@ def Write64Bit : SchedWrite; // instructions) class SISchedMachineModel : SchedMachineModel { - let CompleteModel = 1; + let CompleteModel = 0; // MicroOpBufferSize = 1 means that instructions will always be added // the ready queue when they become available. This exposes them // to the register pressure analysis. diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 41f989ad3228..4189bcce52ea 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -10,9 +10,9 @@ // #include "AMDGPU.h" -#include "AMDGPUMCInstLower.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -64,17 +64,6 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() { return new SIShrinkInstructions(); } -static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI, - const MachineRegisterInfo &MRI) { - if (!MO->isReg()) - return false; - - if (TargetRegisterInfo::isVirtualRegister(MO->getReg())) - return TRI.hasVGPRs(MRI.getRegClass(MO->getReg())); - - return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg())); -} - static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { @@ -92,14 +81,18 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_SUBB_U32_e64: - if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm()) + case AMDGPU::V_SUBBREV_U32_e64: { + const MachineOperand *Src1 + = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg())) return false; // Additional verification is needed for sdst/src2. return true; - + } case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: - if (!isVGPR(Src2, TRI, MRI) || + case AMDGPU::V_FMAC_F32_e64: + if (!Src2->isReg() || !TRI.isVGPR(MRI, Src2->getReg()) || TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) return false; break; @@ -110,7 +103,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, } const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - if (Src1 && (!isVGPR(Src1, TRI, MRI) || + if (Src1 && (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()) || TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) return false; @@ -124,7 +117,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp); } -/// \brief This function checks \p MI for operands defined by a move immediate +/// This function checks \p MI for operands defined by a move immediate /// instruction and then folds the literal constant into the instruction if it /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. 
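The doc comment above describes a classic peephole: when an operand is produced by a move-immediate, fold the literal into the using instruction. Stripped of the MachineInstr machinery, the idea reduces to a def-use rewrite; the toy C++ sketch below (an illustration under simplified, made-up opcodes, not the SIShrinkInstructions code) shows the shape of that transformation on a miniature instruction list.

// Toy model of "fold a move-immediate into its user"; illustrative only.
#include <cstdint>
#include <cstdio>
#include <optional>
#include <unordered_map>
#include <vector>

enum class Op { MovImm, AddReg, AddImm };

struct Inst {
  Op Opcode;
  int Dst;                      // virtual register defined by this inst
  int SrcReg = -1;              // register operand, -1 if unused
  std::optional<int64_t> Imm;   // immediate operand, if any
};

// If the SrcReg of an AddReg is defined by a MovImm, rewrite it to AddImm.
static void foldImmediates(std::vector<Inst> &Prog) {
  std::unordered_map<int, int64_t> ImmDefs; // vreg -> literal it holds
  for (Inst &I : Prog) {
    if (I.Opcode == Op::MovImm) {           // assumes MovImm carries Imm
      ImmDefs[I.Dst] = *I.Imm;
      continue;
    }
    auto It = I.Opcode == Op::AddReg ? ImmDefs.find(I.SrcReg) : ImmDefs.end();
    if (It != ImmDefs.end()) {
      I.Opcode = Op::AddImm;                // fold the literal into the user
      I.Imm = It->second;
      I.SrcReg = -1;
    }
  }
}

int main() {
  std::vector<Inst> Prog = {
      {Op::MovImm, /*Dst=*/1, /*SrcReg=*/-1, /*Imm=*/42},
      {Op::AddReg, /*Dst=*/2, /*SrcReg=*/1, std::nullopt},
  };
  foldImmediates(Prog);
  std::printf("second inst folded: %d, imm=%lld\n",
              Prog[1].Opcode == Op::AddImm, (long long)*Prog[1].Imm);
  return 0;
}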
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, @@ -290,7 +283,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { return false; MachineRegisterInfo &MRI = MF.getRegInfo(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); @@ -442,7 +435,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) // // So, instead of forcing the instruction to write to VCC, we provide - // a hint to the register allocator to use VCC and then we we will run + // a hint to the register allocator to use VCC and then we will run // this pass again after RA and shrink it if it outputs to VCC. MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); continue; @@ -493,7 +486,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } // We can shrink this instruction - DEBUG(dbgs() << "Shrinking " << MI); + LLVM_DEBUG(dbgs() << "Shrinking " << MI); MachineInstrBuilder Inst32 = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); @@ -537,9 +530,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); foldImmediates(*Inst32, TII, MRI); - DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); - - + LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); } } return false; diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 53aefe829737..879726b1528c 100644 --- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This pass adds instructions to enable whole quad mode for pixel +/// This pass adds instructions to enable whole quad mode for pixel /// shaders, and whole wavefront mode for all programs. /// /// Whole quad mode is required for derivative computations, but it interferes @@ -60,6 +60,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" @@ -325,9 +326,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, unsigned Opcode = MI.getOpcode(); char Flags = 0; - if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) { - Flags = StateWQM; - } else if (TII->isWQM(Opcode)) { + if (TII->isWQM(Opcode)) { // Sampling instructions don't need to produce results for all pixels // in a quad, they just require all inputs of a quad to have been // computed for derivatives. @@ -454,6 +453,11 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, if (II.Needs != 0) markInstructionUses(MI, II.Needs, Worklist); + + // Ensure we process a block containing WWM, even if it does not require any + // WQM transitions. 
+ if (II.Needs & StateWWM) + BI.Needs |= StateWWM; } void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, @@ -681,7 +685,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) return; - DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n"); + LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) + << ":\n"); unsigned SavedWQMReg = 0; unsigned SavedNonWWMReg = 0; @@ -844,7 +849,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LowerToCopyInstrs.clear(); CallingConv = MF.getFunction().getCallingConv(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); @@ -884,7 +889,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { } } - DEBUG(printInfo()); + LLVM_DEBUG(printInfo()); lowerCopyInstrs(); diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 8f347986eb8a..7485326017b2 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -63,6 +63,18 @@ class SM_Real <SM_Pseudo ps> bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0); } +class SM_Probe_Pseudo <string opName, dag ins, bit isImm> + : SM_Pseudo<opName, (outs), ins, " $sdata, $sbase, $offset"> { + let mayLoad = 0; + let mayStore = 0; + let has_glc = 0; + let LGKM_CNT = 0; + let ScalarStore = 0; + let hasSideEffects = 1; + let offset_is_imm = isImm; + let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR"); +} + class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : SM_Pseudo<opName, outs, ins, asmOps, pattern> { RegisterClass BaseClass; @@ -81,6 +93,18 @@ class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern let ScalarStore = 1; } +class SM_Discard_Pseudo <string opName, dag ins, bit isImm> + : SM_Pseudo<opName, (outs), ins, " $sbase, $offset"> { + let mayLoad = 0; + let mayStore = 0; + let has_glc = 0; + let has_sdst = 0; + let ScalarStore = 0; + let hasSideEffects = 1; + let offset_is_imm = isImm; + let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR"); +} + multiclass SM_Pseudo_Loads<string opName, RegisterClass baseClass, RegisterClass dstClass> { @@ -125,6 +149,11 @@ multiclass SM_Pseudo_Stores<string opName, } } +multiclass SM_Pseudo_Discards<string opName> { + def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smrd_offset_20:$offset), 1>; + def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>; +} + class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo< opName, (outs SReg_64_XEXEC:$sdst), (ins), " $sdst", [(set i64:$sdst, (node))]> { @@ -144,6 +173,60 @@ class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo< let has_offset = 0; } +multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> { + def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smrd_offset_20:$offset), 1>; + def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>; +} + +//===----------------------------------------------------------------------===// +// Scalar Atomic Memory Classes +//===----------------------------------------------------------------------===// + +class SM_Atomic_Pseudo <string opName, + dag outs, dag ins, string asmOps, bit isRet> + : SM_Pseudo<opName, outs, ins, asmOps, []> 
{ + + bit glc = isRet; + + let mayLoad = 1; + let mayStore = 1; + let has_glc = 1; + + // Should these be set? + let ScalarStore = 1; + let hasSideEffects = 1; + let maybeAtomic = 1; +} + +class SM_Pseudo_Atomic<string opName, + RegisterClass baseClass, + RegisterClass dataClass, + bit isImm, + bit isRet> : + SM_Atomic_Pseudo<opName, + !if(isRet, (outs dataClass:$sdst), (outs)), + !if(isImm, + (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset), + (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset)), + !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", ""), + isRet> { + let offset_is_imm = isImm; + let PseudoInstr = opName # !if(isImm, + !if(isRet, "_IMM_RTN", "_IMM"), + !if(isRet, "_SGPR_RTN", "_SGPR")); + + let Constraints = !if(isRet, "$sdst = $sdata", ""); + let DisableEncoding = !if(isRet, "$sdata", ""); +} + +multiclass SM_Pseudo_Atomics<string opName, + RegisterClass baseClass, + RegisterClass dataClass> { + def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 0>; + def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 0>; + def _IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 1>; + def _SGPR_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 1>; +} //===----------------------------------------------------------------------===// // Scalar Memory Instructions @@ -211,9 +294,85 @@ let SubtargetPredicate = isVI in { def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>; def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>; -} // SubtargetPredicate = isVI +defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>; +defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>; +} // SubtargetPredicate = isVI +let SubtargetPredicate = HasFlatScratchInsts, Uses = [FLAT_SCR] in { +defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>; +defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>; +defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>; + +defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>; +defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>; +defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg_64, SReg_128>; +} // SubtargetPredicate = HasFlatScratchInsts + +let SubtargetPredicate = HasScalarAtomics in { + +defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_buffer_atomic_swap", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <"s_buffer_atomic_add", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics <"s_buffer_atomic_sub", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_buffer_atomic_smin", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_buffer_atomic_umin", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_buffer_atomic_smax", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_buffer_atomic_umax", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <"s_buffer_atomic_and", SReg_128, SReg_32_XM0_XEXEC>; +defm 
S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <"s_buffer_atomic_or", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <"s_buffer_atomic_xor", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <"s_buffer_atomic_inc", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <"s_buffer_atomic_dec", SReg_128, SReg_32_XM0_XEXEC>; + +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_swap_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap_x2", SReg_128, SReg_128>; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_add_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_sub_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smin_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umin_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smax_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umax_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_and_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_or_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_xor_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_inc_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_dec_x2", SReg_128, SReg_64_XEXEC>; + +defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_atomic_swap", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_atomic_cmpswap", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_ADD : SM_Pseudo_Atomics <"s_atomic_add", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_SUB : SM_Pseudo_Atomics <"s_atomic_sub", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_atomic_smin", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_atomic_umin", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_atomic_smax", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_atomic_umax", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_AND : SM_Pseudo_Atomics <"s_atomic_and", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_OR : SM_Pseudo_Atomics <"s_atomic_or", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_XOR : SM_Pseudo_Atomics <"s_atomic_xor", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_INC : SM_Pseudo_Atomics <"s_atomic_inc", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_DEC : SM_Pseudo_Atomics <"s_atomic_dec", SReg_64, SReg_32_XM0_XEXEC>; + +defm S_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_atomic_swap_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_atomic_cmpswap_x2", SReg_64, SReg_128>; +defm S_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_atomic_add_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_atomic_sub_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_atomic_smin_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_atomic_umin_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_atomic_smax_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_atomic_umax_x2", SReg_64, 
SReg_64_XEXEC>; +defm S_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_atomic_and_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_atomic_or_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_atomic_xor_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_atomic_inc_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_64, SReg_64_XEXEC>; + +} // let SubtargetPredicate = HasScalarAtomics + +let SubtargetPredicate = isGFX9 in { +defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">; +defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">; +} //===----------------------------------------------------------------------===// // Scalar Memory Patterns @@ -223,11 +382,9 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime> def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ auto Ld = cast<LoadSDNode>(N); return Ld->getAlignment() >= 4 && - ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && - static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) || + ((((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) || (Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT)) && !N->isDivergent()) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && - !Ld->isVolatile() && - static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) && + !Ld->isVolatile() && !N->isDivergent() && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); }]>; @@ -407,6 +564,11 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps, } } +multiclass SM_Real_Probe_vi<bits<8> op, string ps> { + def _IMM_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>; + def _SGPR_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>; +} + defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">; defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">; defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">; @@ -434,6 +596,103 @@ def S_DCACHE_WB_VOL_vi : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>; def S_MEMTIME_vi : SMEM_Real_vi <0x24, S_MEMTIME>; def S_MEMREALTIME_vi : SMEM_Real_vi <0x25, S_MEMREALTIME>; +defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_vi <0x05, "S_SCRATCH_LOAD_DWORD">; +defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_vi <0x06, "S_SCRATCH_LOAD_DWORDX2">; +defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_vi <0x07, "S_SCRATCH_LOAD_DWORDX4">; + +defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_vi <0x15, "S_SCRATCH_STORE_DWORD">; +defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_vi <0x16, "S_SCRATCH_STORE_DWORDX2">; +defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_vi <0x17, "S_SCRATCH_STORE_DWORDX4">; + +defm S_ATC_PROBE : SM_Real_Probe_vi <0x26, "S_ATC_PROBE">; +defm S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27, "S_ATC_PROBE_BUFFER">; + +//===----------------------------------------------------------------------===// +// GFX9 +//===----------------------------------------------------------------------===// + +class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps> + : SMEM_Real_vi <op, ps> { + + bits<7> sdata; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + let glc = ps.glc; + let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0}); +} + +multiclass SM_Real_Atomics_vi<bits<8> op, string ps> { + def _IMM_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>; + def _SGPR_vi : 
SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>; + def _IMM_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>; + def _SGPR_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>; +} + +defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40, "S_BUFFER_ATOMIC_SWAP">; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x41, "S_BUFFER_ATOMIC_CMPSWAP">; +defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_vi <0x42, "S_BUFFER_ATOMIC_ADD">; +defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_vi <0x43, "S_BUFFER_ATOMIC_SUB">; +defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_vi <0x44, "S_BUFFER_ATOMIC_SMIN">; +defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_vi <0x45, "S_BUFFER_ATOMIC_UMIN">; +defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_vi <0x46, "S_BUFFER_ATOMIC_SMAX">; +defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_vi <0x47, "S_BUFFER_ATOMIC_UMAX">; +defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_vi <0x48, "S_BUFFER_ATOMIC_AND">; +defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_vi <0x49, "S_BUFFER_ATOMIC_OR">; +defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_vi <0x4a, "S_BUFFER_ATOMIC_XOR">; +defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_vi <0x4b, "S_BUFFER_ATOMIC_INC">; +defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_vi <0x4c, "S_BUFFER_ATOMIC_DEC">; + +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0x60, "S_BUFFER_ATOMIC_SWAP_X2">; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0x62, "S_BUFFER_ATOMIC_ADD_X2">; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0x63, "S_BUFFER_ATOMIC_SUB_X2">; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0x64, "S_BUFFER_ATOMIC_SMIN_X2">; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0x65, "S_BUFFER_ATOMIC_UMIN_X2">; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0x66, "S_BUFFER_ATOMIC_SMAX_X2">; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0x67, "S_BUFFER_ATOMIC_UMAX_X2">; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0x68, "S_BUFFER_ATOMIC_AND_X2">; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0x69, "S_BUFFER_ATOMIC_OR_X2">; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0x6a, "S_BUFFER_ATOMIC_XOR_X2">; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0x6b, "S_BUFFER_ATOMIC_INC_X2">; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0x6c, "S_BUFFER_ATOMIC_DEC_X2">; + +defm S_ATOMIC_SWAP : SM_Real_Atomics_vi <0x80, "S_ATOMIC_SWAP">; +defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x81, "S_ATOMIC_CMPSWAP">; +defm S_ATOMIC_ADD : SM_Real_Atomics_vi <0x82, "S_ATOMIC_ADD">; +defm S_ATOMIC_SUB : SM_Real_Atomics_vi <0x83, "S_ATOMIC_SUB">; +defm S_ATOMIC_SMIN : SM_Real_Atomics_vi <0x84, "S_ATOMIC_SMIN">; +defm S_ATOMIC_UMIN : SM_Real_Atomics_vi <0x85, "S_ATOMIC_UMIN">; +defm S_ATOMIC_SMAX : SM_Real_Atomics_vi <0x86, "S_ATOMIC_SMAX">; +defm S_ATOMIC_UMAX : SM_Real_Atomics_vi <0x87, "S_ATOMIC_UMAX">; +defm S_ATOMIC_AND : SM_Real_Atomics_vi <0x88, "S_ATOMIC_AND">; +defm S_ATOMIC_OR : SM_Real_Atomics_vi <0x89, "S_ATOMIC_OR">; +defm S_ATOMIC_XOR : SM_Real_Atomics_vi <0x8a, "S_ATOMIC_XOR">; +defm S_ATOMIC_INC : SM_Real_Atomics_vi <0x8b, "S_ATOMIC_INC">; +defm S_ATOMIC_DEC : SM_Real_Atomics_vi <0x8c, "S_ATOMIC_DEC">; + +defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0xa0, "S_ATOMIC_SWAP_X2">; +defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0xa1, "S_ATOMIC_CMPSWAP_X2">; +defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0xa2, "S_ATOMIC_ADD_X2">; +defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0xa3, "S_ATOMIC_SUB_X2">; +defm S_ATOMIC_SMIN_X2 : 
SM_Real_Atomics_vi <0xa4, "S_ATOMIC_SMIN_X2">; +defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0xa5, "S_ATOMIC_UMIN_X2">; +defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0xa6, "S_ATOMIC_SMAX_X2">; +defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0xa7, "S_ATOMIC_UMAX_X2">; +defm S_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0xa8, "S_ATOMIC_AND_X2">; +defm S_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0xa9, "S_ATOMIC_OR_X2">; +defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0xaa, "S_ATOMIC_XOR_X2">; +defm S_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0xab, "S_ATOMIC_INC_X2">; +defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac, "S_ATOMIC_DEC_X2">; + +multiclass SM_Real_Discard_vi<bits<8> op, string ps> { + def _IMM_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_IMM)>; + def _SGPR_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>; +} + +defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28, "S_DCACHE_DISCARD">; +defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_vi <0x29, "S_DCACHE_DISCARD_X2">; //===----------------------------------------------------------------------===// // CI @@ -502,7 +761,7 @@ let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in { class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat < (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> { + (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> { let OtherPredicates = [isCIOnly]; } diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index 02a95a4b6f24..6f5db9644c86 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -19,17 +19,28 @@ def GPRIdxMode : Operand<i32> { let OperandType = "OPERAND_IMMEDIATE"; } +class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps, + list<dag> pattern=[]> : + InstSI<outs, ins, "", pattern>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let SubtargetPredicate = isGCN; + + string Mnemonic = opName; + string AsmOperands = asmOps; + + bits<1> has_sdst = 0; +} + //===----------------------------------------------------------------------===// // SOP1 Instructions //===----------------------------------------------------------------------===// class SOP1_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : - InstSI <outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; - let SubtargetPredicate = isGCN; + SOP_Pseudo<opName, outs, ins, asmOps, pattern> { let mayLoad = 0; let mayStore = 0; @@ -40,9 +51,6 @@ class SOP1_Pseudo <string opName, dag outs, dag ins, let Size = 4; let UseNamedOperandTable = 1; - string Mnemonic = opName; - string AsmOperands = asmOps; - bits<1> has_src0 = 1; bits<1> has_sdst = 1; } @@ -247,17 +255,25 @@ def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> { } } +let SubtargetPredicate = isGFX9 in { + let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in { + def S_ANDN1_SAVEEXEC_B64 : SOP1_64<"s_andn1_saveexec_b64">; + def S_ORN1_SAVEEXEC_B64 : SOP1_64<"s_orn1_saveexec_b64">; + def S_ANDN1_WREXEC_B64 : SOP1_64<"s_andn1_wrexec_b64">; + def S_ANDN2_WREXEC_B64 : SOP1_64<"s_andn2_wrexec_b64">; + } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] + + def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">; +} // End SubtargetPredicate = isGFX9 + //===----------------------------------------------------------------------===// // SOP2 Instructions 
//===----------------------------------------------------------------------===// class SOP2_Pseudo<string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : - InstSI<outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; - let SubtargetPredicate = isGCN; + SOP_Pseudo<opName, outs, ins, asmOps, pattern> { + let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -266,10 +282,7 @@ class SOP2_Pseudo<string opName, dag outs, dag ins, let SchedRW = [WriteSALU]; let UseNamedOperandTable = 1; - string Mnemonic = opName; - string AsmOperands = asmOps; - - bits<1> has_sdst = 1; + let has_sdst = 1; // Pseudo instructions have no encodings, but adding this field here allows // us to do: @@ -279,7 +292,7 @@ class SOP2_Pseudo<string opName, dag outs, dag ins, // let Size = 4; // Do we need size here? } -class SOP2_Real<bits<7> op, SOP2_Pseudo ps> : +class SOP2_Real<bits<7> op, SOP_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # " " # ps.AsmOperands, []>, Enc32 { @@ -482,6 +495,16 @@ let SubtargetPredicate = isGFX9 in { def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">; def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">; def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">; + + let Defs = [SCC] in { + def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32">; + def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32">; + def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32">; + def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32">; + } // End Defs = [SCC] + + def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">; + def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">; } //===----------------------------------------------------------------------===// @@ -659,6 +682,16 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo < } // End hasSideEffects = 1 +let SubtargetPredicate = isGFX9 in { + def S_CALL_B64 : SOPK_Pseudo< + "s_call_b64", + (outs SReg_64:$sdst), + (ins s16imm:$simm16), + "$sdst, $simm16"> { + let isCall = 1; + } +} + //===----------------------------------------------------------------------===// // SOPC Instructions //===----------------------------------------------------------------------===// @@ -806,6 +839,13 @@ def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> { } } +let SubtargetPredicate = isGFX9 in { + let isBarrier = 1, isReturn = 1, simm16 = 0 in { + def S_ENDPGM_ORDERED_PS_DONE : + SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">; + } // End isBarrier = 1, isReturn = 1, simm16 = 0 +} // End SubtargetPredicate = isGFX9 + let isBranch = 1, SchedRW = [WriteBranch] in { def S_BRANCH : SOPP < 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", @@ -1312,3 +1352,26 @@ def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>; //def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>, Select_vi<S_SETREG_IMM32_B32.Mnemonic>; + +def S_CALL_B64_vi : SOPK_Real_vi <0x15, S_CALL_B64>; + +//===----------------------------------------------------------------------===// +// SOP1 - GFX9. 
+//===----------------------------------------------------------------------===// + +def S_ANDN1_SAVEEXEC_B64_vi : SOP1_Real_vi<0x33, S_ANDN1_SAVEEXEC_B64>; +def S_ORN1_SAVEEXEC_B64_vi : SOP1_Real_vi<0x34, S_ORN1_SAVEEXEC_B64>; +def S_ANDN1_WREXEC_B64_vi : SOP1_Real_vi<0x35, S_ANDN1_WREXEC_B64>; +def S_ANDN2_WREXEC_B64_vi : SOP1_Real_vi<0x36, S_ANDN2_WREXEC_B64>; +def S_BITREPLICATE_B64_B32_vi : SOP1_Real_vi<0x37, S_BITREPLICATE_B64_B32>; + +//===----------------------------------------------------------------------===// +// SOP2 - GFX9. +//===----------------------------------------------------------------------===// + +def S_LSHL1_ADD_U32_vi : SOP2_Real_vi<0x2e, S_LSHL1_ADD_U32>; +def S_LSHL2_ADD_U32_vi : SOP2_Real_vi<0x2f, S_LSHL2_ADD_U32>; +def S_LSHL3_ADD_U32_vi : SOP2_Real_vi<0x30, S_LSHL3_ADD_U32>; +def S_LSHL4_ADD_U32_vi : SOP2_Real_vi<0x31, S_LSHL4_ADD_U32>; +def S_MUL_HI_U32_vi : SOP2_Real_vi<0x2c, S_MUL_HI_U32>; +def S_MUL_HI_I32_vi : SOP2_Real_vi<0x2d, S_MUL_HI_I32>; diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp index f61e2e413ad4..e4c442db3016 100644 --- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp +++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -16,19 +16,19 @@ using namespace llvm; -/// \brief The target which supports all AMD GPUs. This will eventually +/// The target which supports all AMD GPUs. This will eventually /// be deprecated and there will be a R600 target and a GCN target. Target &llvm::getTheAMDGPUTarget() { static Target TheAMDGPUTarget; return TheAMDGPUTarget; } -/// \brief The target for GCN GPUs +/// The target for GCN GPUs Target &llvm::getTheGCNTarget() { static Target TheGCNTarget; return TheGCNTarget; } -/// \brief Extern function to initialize the targets for the AMDGPU backend +/// Extern function to initialize the targets for the AMDGPU backend extern "C" void LLVMInitializeAMDGPUTargetInfo() { RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600", "AMD GPUs HD2XXX-HD6XXX", "AMDGPU"); diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 03b11ae80500..9eb4c6513cce 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -61,7 +61,15 @@ const char* const IdSymbolic[] = { "HW_REG_HW_ID", "HW_REG_GPR_ALLOC", "HW_REG_LDS_ALLOC", - "HW_REG_IB_STS" + "HW_REG_IB_STS", + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + "HW_REG_SH_MEM_BASES" }; } // namespace Hwreg diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 125a3b22d0cf..3fd3c75874a3 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUBaseInfo.h" +#include "AMDGPUTargetTransformInfo.h" #include "AMDGPU.h" #include "SIDefines.h" #include "llvm/ADT/StringRef.h" @@ -52,7 +53,7 @@ unsigned getBitMask(unsigned Shift, unsigned Width) { return ((1 << Width) - 1) << Shift; } -/// \brief Packs \p Src into \p Dst for given bit \p Shift and bit \p Width. +/// Packs \p Src into \p Dst for given bit \p Shift and bit \p Width. /// /// \returns Packed \p Dst. 
unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) { @@ -61,7 +62,7 @@ unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) { return Dst; } -/// \brief Unpacks bits from \p Src for given bit \p Shift and bit \p Width. +/// Unpacks bits from \p Src for given bit \p Shift and bit \p Width. /// /// \returns Unpacked bits. unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) { @@ -96,64 +97,34 @@ unsigned getVmcntBitWidthHi() { return 2; } namespace llvm { -static cl::opt<bool> EnablePackedInlinableLiterals( - "enable-packed-inlinable-literals", - cl::desc("Enable packed inlinable literals (v2f16, v2i16)"), - cl::init(false)); - namespace AMDGPU { -LLVM_READNONE -static inline Channels indexToChannel(unsigned Channel) { - switch (Channel) { - case 1: - return AMDGPU::Channels_1; - case 2: - return AMDGPU::Channels_2; - case 3: - return AMDGPU::Channels_3; - case 4: - return AMDGPU::Channels_4; - default: - llvm_unreachable("invalid MIMG channel"); - } -} +struct MIMGInfo { + uint16_t Opcode; + uint16_t BaseOpcode; + uint8_t MIMGEncoding; + uint8_t VDataDwords; + uint8_t VAddrDwords; +}; +#define GET_MIMGBaseOpcodesTable_IMPL +#define GET_MIMGDimInfoTable_IMPL +#define GET_MIMGInfoTable_IMPL +#include "AMDGPUGenSearchableTables.inc" -// FIXME: Need to handle d16 images correctly. -static unsigned rcToChannels(unsigned RCID) { - switch (RCID) { - case AMDGPU::VGPR_32RegClassID: - return 1; - case AMDGPU::VReg_64RegClassID: - return 2; - case AMDGPU::VReg_96RegClassID: - return 3; - case AMDGPU::VReg_128RegClassID: - return 4; - default: - llvm_unreachable("invalid MIMG register class"); - } +int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, + unsigned VDataDwords, unsigned VAddrDwords) { + const MIMGInfo *Info = getMIMGOpcodeHelper(BaseOpcode, MIMGEncoding, + VDataDwords, VAddrDwords); + return Info ? Info->Opcode : -1; } -int getMaskedMIMGOp(const MCInstrInfo &MII, unsigned Opc, unsigned NewChannels) { - AMDGPU::Channels Channel = AMDGPU::indexToChannel(NewChannels); - unsigned OrigChannels = rcToChannels(MII.get(Opc).OpInfo[0].RegClass); - if (NewChannels == OrigChannels) - return Opc; - - switch (OrigChannels) { - case 1: - return AMDGPU::getMaskedMIMGOp1(Opc, Channel); - case 2: - return AMDGPU::getMaskedMIMGOp2(Opc, Channel); - case 3: - return AMDGPU::getMaskedMIMGOp3(Opc, Channel); - case 4: - return AMDGPU::getMaskedMIMGOp4(Opc, Channel); - default: - llvm_unreachable("invalid MIMG channel"); - } +int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) { + const MIMGInfo *OrigInfo = getMIMGInfo(Opc); + const MIMGInfo *NewInfo = + getMIMGOpcodeHelper(OrigInfo->BaseOpcode, OrigInfo->MIMGEncoding, + NewChannels, OrigInfo->VAddrDwords); + return NewInfo ? NewInfo->Opcode : -1; } // Wrapper for Tablegen'd function. enum Subtarget is not defined in any @@ -183,10 +154,10 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { return {7, 0, 3}; if (Features.test(FeatureISAVersion7_0_4)) return {7, 0, 4}; + if (Features.test(FeatureSeaIslands)) + return {7, 0, 0}; // GCN GFX8 (Volcanic Islands (VI)). - if (Features.test(FeatureISAVersion8_0_0)) - return {8, 0, 0}; if (Features.test(FeatureISAVersion8_0_1)) return {8, 0, 1}; if (Features.test(FeatureISAVersion8_0_2)) @@ -195,14 +166,22 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { return {8, 0, 3}; if (Features.test(FeatureISAVersion8_1_0)) return {8, 1, 0}; + if (Features.test(FeatureVolcanicIslands)) + return {8, 0, 0}; // GCN GFX9. 
if (Features.test(FeatureISAVersion9_0_0)) return {9, 0, 0}; if (Features.test(FeatureISAVersion9_0_2)) return {9, 0, 2}; + if (Features.test(FeatureISAVersion9_0_4)) + return {9, 0, 4}; + if (Features.test(FeatureISAVersion9_0_6)) + return {9, 0, 6}; + if (Features.test(FeatureGFX9)) + return {9, 0, 0}; - if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands)) + if (Features.test(FeatureSouthernIslands)) return {0, 0, 0}; return {7, 0, 0}; } @@ -219,11 +198,15 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) { << ISAVersion.Major << ISAVersion.Minor << ISAVersion.Stepping; + + if (hasXNACK(*STI)) + Stream << "+xnack"; + Stream.flush(); } -bool hasCodeObjectV3(const FeatureBitset &Features) { - return Features.test(FeatureCodeObjectV3); +bool hasCodeObjectV3(const MCSubtargetInfo *STI) { + return STI->getFeatureBits().test(FeatureCodeObjectV3); } unsigned getWavefrontSize(const FeatureBitset &Features) { @@ -260,7 +243,7 @@ unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features, } unsigned getMaxWavesPerCU(const FeatureBitset &Features) { - return getMaxWavesPerEU(Features) * getEUsPerCU(Features); + return getMaxWavesPerEU() * getEUsPerCU(Features); } unsigned getMaxWavesPerCU(const FeatureBitset &Features, @@ -272,9 +255,7 @@ unsigned getMinWavesPerEU(const FeatureBitset &Features) { return 1; } -unsigned getMaxWavesPerEU(const FeatureBitset &Features) { - if (!Features.test(FeatureGCN)) - return 8; +unsigned getMaxWavesPerEU() { // FIXME: Need to take scratch memory into account. return 10; } @@ -330,11 +311,13 @@ unsigned getAddressableNumSGPRs(const FeatureBitset &Features) { unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { assert(WavesPerEU != 0); - if (WavesPerEU >= getMaxWavesPerEU(Features)) + if (WavesPerEU >= getMaxWavesPerEU()) return 0; - unsigned MinNumSGPRs = - alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1), - getSGPRAllocGranule(Features)) + 1; + + unsigned MinNumSGPRs = getTotalNumSGPRs(Features) / (WavesPerEU + 1); + if (Features.test(FeatureTrapHandler)) + MinNumSGPRs -= std::min(MinNumSGPRs, (unsigned)TRAP_NUM_SGPRS); + MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(Features)) + 1; return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features)); } @@ -343,14 +326,49 @@ unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, assert(WavesPerEU != 0); IsaVersion Version = getIsaVersion(Features); - unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU, - getSGPRAllocGranule(Features)); unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features); if (Version.Major >= 8 && !Addressable) AddressableNumSGPRs = 112; + unsigned MaxNumSGPRs = getTotalNumSGPRs(Features) / WavesPerEU; + if (Features.test(FeatureTrapHandler)) + MaxNumSGPRs -= std::min(MaxNumSGPRs, (unsigned)TRAP_NUM_SGPRS); + MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(Features)); return std::min(MaxNumSGPRs, AddressableNumSGPRs); } +unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, + bool FlatScrUsed, bool XNACKUsed) { + unsigned ExtraSGPRs = 0; + if (VCCUsed) + ExtraSGPRs = 2; + + IsaVersion Version = getIsaVersion(Features); + if (Version.Major < 8) { + if (FlatScrUsed) + ExtraSGPRs = 4; + } else { + if (XNACKUsed) + ExtraSGPRs = 4; + + if (FlatScrUsed) + ExtraSGPRs = 6; + } + + return ExtraSGPRs; +} + +unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, + bool FlatScrUsed) { + return getNumExtraSGPRs(Features, VCCUsed, 
FlatScrUsed, + Features[AMDGPU::FeatureXNACK]); +} + +unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs) { + NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(Features)); + // SGPRBlocks is actual number of SGPR blocks minus 1. + return NumSGPRs / getSGPREncodingGranule(Features) - 1; +} + unsigned getVGPRAllocGranule(const FeatureBitset &Features) { return 4; } @@ -370,7 +388,7 @@ unsigned getAddressableNumVGPRs(const FeatureBitset &Features) { unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { assert(WavesPerEU != 0); - if (WavesPerEU >= getMaxWavesPerEU(Features)) + if (WavesPerEU >= getMaxWavesPerEU()) return 0; unsigned MinNumVGPRs = alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1), @@ -387,6 +405,12 @@ unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { return std::min(MaxNumVGPRs, AddressableNumVGPRs); } +unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumVGPRs) { + NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(Features)); + // VGPRBlocks is actual number of VGPR blocks minus 1. + return NumVGPRs / getVGPREncodingGranule(Features) - 1; +} + } // end namespace IsaInfo void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, @@ -396,7 +420,7 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, memset(&Header, 0, sizeof(Header)); Header.amd_kernel_code_version_major = 1; - Header.amd_kernel_code_version_minor = 1; + Header.amd_kernel_code_version_minor = 2; Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU Header.amd_machine_version_major = ISA.Major; Header.amd_machine_version_minor = ISA.Minor; @@ -416,6 +440,21 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.private_segment_alignment = 4; } +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() { + amdhsa::kernel_descriptor_t KD; + memset(&KD, 0, sizeof(KD)); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, + amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, 1); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1); + return KD; +} + bool isGroupSegment(const GlobalValue *GV) { return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; } @@ -425,7 +464,8 @@ bool isGlobalSegment(const GlobalValue *GV) { } bool isReadOnlySegment(const GlobalValue *GV) { - return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT; } bool shouldEmitConstantsToTextSection(const Triple &TT) { @@ -598,6 +638,18 @@ bool isEntryFunctionCC(CallingConv::ID CC) { } } +bool hasXNACK(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureXNACK]; +} + +bool hasMIMG_R128(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128]; +} + +bool hasPackedD16(const MCSubtargetInfo &STI) { + return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]; +} + bool isSI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; } @@ -681,6 +733,8 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { case node: return isGFX9(STI) ? 
node##_gfx9 : node##_vi; unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { + if (STI.getTargetTriple().getArch() == Triple::r600) + return Reg; MAP_REG2REG } @@ -837,9 +891,6 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { assert(HasInv2Pi); - if (!EnablePackedInlinableLiterals) - return false; - int16_t Lo16 = static_cast<int16_t>(Literal); int16_t Hi16 = static_cast<int16_t>(Literal >> 16); return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); @@ -871,24 +922,6 @@ bool isArgPassedInSGPR(const Argument *A) { } } -// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. -bool isUniformMMO(const MachineMemOperand *MMO) { - const Value *Ptr = MMO->getValue(); - // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. - if (!Ptr || isa<UndefValue>(Ptr) || - isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) - return true; - - if (const Argument *Arg = dyn_cast<Argument>(Ptr)) - return isArgPassedInSGPR(Arg); - - const Instruction *I = dyn_cast<Instruction>(Ptr); - return I && I->getMetadata("amdgpu.uniform"); -} - int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { if (isGCN3Encoding(ST)) return ByteOffset; @@ -909,18 +942,10 @@ namespace llvm { namespace AMDGPU { AMDGPUAS getAMDGPUAS(Triple T) { - auto Env = T.getEnvironmentName(); AMDGPUAS AS; - if (Env == "amdgiz" || Env == "amdgizcl") { - AS.FLAT_ADDRESS = 0; - AS.PRIVATE_ADDRESS = 5; - AS.REGION_ADDRESS = 4; - } - else { - AS.FLAT_ADDRESS = 4; - AS.PRIVATE_ADDRESS = 0; - AS.REGION_ADDRESS = 5; - } + AS.FLAT_ADDRESS = 0; + AS.PRIVATE_ADDRESS = 5; + AS.REGION_ADDRESS = 2; return AS; } @@ -931,5 +956,21 @@ AMDGPUAS getAMDGPUAS(const TargetMachine &M) { AMDGPUAS getAMDGPUAS(const Module &M) { return getAMDGPUAS(Triple(M.getTargetTriple())); } + +namespace { + +struct SourceOfDivergence { + unsigned Intr; +}; +const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr); + +#define GET_SourcesOfDivergence_IMPL +#include "AMDGPUGenSearchableTables.inc" + +} // end anonymous namespace + +bool isIntrinsicSourceOfDivergence(unsigned IntrID) { + return lookupSourceOfDivergence(IntrID); +} } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a215b445378e..70681c271697 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -16,6 +16,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/CallingConv.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cstdint> @@ -28,24 +29,31 @@ class Argument; class FeatureBitset; class Function; class GlobalValue; -class MachineMemOperand; class MCContext; class MCRegisterClass; class MCRegisterInfo; class MCSection; class MCSubtargetInfo; +class MachineMemOperand; class Triple; namespace AMDGPU { + +#define GET_MIMGBaseOpcode_DECL +#define GET_MIMGDim_DECL +#define GET_MIMGEncoding_DECL +#include "AMDGPUGenSearchableTables.inc" + namespace IsaInfo { enum { // The closed Vulkan driver sets 96, which limits the wave count to 8 but // doesn't spill SGPRs as much as when 80 is set. 
- FIXED_NUM_SGPRS_FOR_INIT_BUG = 96 + FIXED_NUM_SGPRS_FOR_INIT_BUG = 96, + TRAP_NUM_SGPRS = 16 }; -/// \brief Instruction set architecture version. +/// Instruction set architecture version. struct IsaVersion { unsigned Major; unsigned Minor; @@ -55,12 +63,12 @@ struct IsaVersion { /// \returns Isa version for given subtarget \p Features. IsaVersion getIsaVersion(const FeatureBitset &Features); -/// \brief Streams isa version string for given subtarget \p STI into \p Stream. +/// Streams isa version string for given subtarget \p STI into \p Stream. void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream); -/// \returns True if given subtarget \p Features support code object version 3, +/// \returns True if given subtarget \p STI supports code object version 3, /// false otherwise. -bool hasCodeObjectV3(const FeatureBitset &Features); +bool hasCodeObjectV3(const MCSubtargetInfo *STI); /// \returns Wavefront size for given subtarget \p Features. unsigned getWavefrontSize(const FeatureBitset &Features); @@ -92,7 +100,7 @@ unsigned getMinWavesPerEU(const FeatureBitset &Features); /// \returns Maximum number of waves per execution unit for given subtarget \p /// Features without any kind of limitation. -unsigned getMaxWavesPerEU(const FeatureBitset &Features); +unsigned getMaxWavesPerEU(); /// \returns Maximum number of waves per execution unit for given subtarget \p /// Features and limited by given \p FlatWorkGroupSize. @@ -131,6 +139,22 @@ unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU); unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, bool Addressable); +/// \returns Number of extra SGPRs implicitly required by given subtarget \p +/// Features when the given special registers are used. +unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, + bool FlatScrUsed, bool XNACKUsed); + +/// \returns Number of extra SGPRs implicitly required by given subtarget \p +/// Features when the given special registers are used. XNACK is inferred from +/// \p Features. +unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, + bool FlatScrUsed); + +/// \returns Number of SGPR blocks needed for given subtarget \p Features when +/// \p NumSGPRs are used. \p NumSGPRs should already include any special +/// register counts. +unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs); + /// \returns VGPR allocation granularity for given subtarget \p Features. unsigned getVGPRAllocGranule(const FeatureBitset &Features); @@ -151,20 +175,57 @@ unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); /// execution unit requirement for given subtarget \p Features. unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); +/// \returns Number of VGPR blocks needed for given subtarget \p Features when +/// \p NumVGPRs are used. 
+unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs); + } // end namespace IsaInfo LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); +struct MIMGBaseOpcodeInfo { + MIMGBaseOpcode BaseOpcode; + bool Store; + bool Atomic; + bool AtomicX2; + bool Sampler; + + uint8_t NumExtraArgs; + bool Gradients; + bool Coordinates; + bool LodOrClampOrMip; + bool HasD16; +}; + +LLVM_READONLY +const MIMGBaseOpcodeInfo *getMIMGBaseOpcodeInfo(unsigned BaseOpcode); + +struct MIMGDimInfo { + MIMGDim Dim; + uint8_t NumCoords; + uint8_t NumGradients; + bool DA; +}; + LLVM_READONLY -int getMaskedMIMGOp(const MCInstrInfo &MII, - unsigned Opc, unsigned NewChannels); +const MIMGDimInfo *getMIMGDimInfo(unsigned Dim); + +LLVM_READONLY +int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, + unsigned VDataDwords, unsigned VAddrDwords); + +LLVM_READONLY +int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels); + LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features); +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(); + bool isGroupSegment(const GlobalValue *GV); bool isGlobalSegment(const GlobalValue *GV); bool isReadOnlySegment(const GlobalValue *GV); @@ -216,7 +277,7 @@ unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version. unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); -/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa +/// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and /// \p Lgkmcnt respectively. /// @@ -240,7 +301,7 @@ unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned Lgkmcnt); -/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa +/// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa /// \p Version. /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows: @@ -278,41 +339,45 @@ inline bool isKernel(CallingConv::ID CC) { } } +bool hasXNACK(const MCSubtargetInfo &STI); +bool hasMIMG_R128(const MCSubtargetInfo &STI); +bool hasPackedD16(const MCSubtargetInfo &STI); + bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); -/// \brief Is Reg - scalar register +/// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); -/// \brief Is there any intersection between registers +/// Is there any intersection between registers bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI); /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); -/// \brief Convert hardware register \p Reg to a pseudo register +/// Convert hardware register \p Reg to a pseudo register LLVM_READNONE unsigned mc2PseudoReg(unsigned Reg); -/// \brief Can this operand also contain immediate values? +/// Can this operand also contain immediate values? bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo); -/// \brief Is this floating-point operand? +/// Is this floating-point operand? 
bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);

-/// \brief Does this opearnd support only inlinable literals?
+/// Does this opearnd support only inlinable literals?
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);

-/// \brief Get the size in bits of a register from the register class \p RC.
+/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(unsigned RCID);

-/// \brief Get the size in bits of a register from the register class \p RC.
+/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC);

-/// \brief Get size of register operand
+/// Get size of register operand
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
                           unsigned OpNo);
@@ -349,7 +414,7 @@ inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
  return getOperandSize(Desc.OpInfo[OpNo]);
}

-/// \brief Is this literal inlinable
+/// Is this literal inlinable
LLVM_READNONE
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
@@ -363,7 +428,6 @@ LLVM_READNONE
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);

bool isArgPassedInSGPR(const Argument *Arg);
-bool isUniformMMO(const MachineMemOperand *MMO);

/// \returns The encoding that will be used for \p ByteOffset in the SMRD
/// offset field.
@@ -374,6 +438,9 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
/// not the encoded offset.
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);

+/// \returns true if the intrinsic is divergent
+bool isIntrinsicSourceOfDivergence(unsigned IntrID);
+
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
new file mode 100644
index 000000000000..1924f71f11c8
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MBB A lane-dominates MBB B if
+// 1. A dominates B in the usual sense, i.e. every path from the entry to B
+//    goes through A, and
+// 2. whenever B executes, every active lane during that execution of B was
+//    also active during the most recent execution of A.
+//
+// The simplest example where A dominates B but does not lane-dominate it is
+// where A is a loop:
+//
+//     |
+//   +--+
+//   A  |
+//   +--+
+//     |
+//     B
+//
+// Unfortunately, the second condition is not fully captured by the control
+// flow graph when it is unstructured (as may happen when branch conditions are
+// uniform).
+//
+// The following replacement of the second condition is a conservative
+// approximation. It is an equivalent condition when the CFG is fully
+// structured:
+//
+// 2'. every cycle in the CFG that contains A also contains B.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULaneDominator.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// Given machine basic blocks A and B where A dominates B, check whether
+// A lane-dominates B.
+// +// The check is conservative, i.e. there can be false-negatives. +bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) { + // Check whether A is reachable from itself without going through B. + DenseSet<MachineBasicBlock *> Reachable; + SmallVector<MachineBasicBlock *, 8> Stack; + + Stack.push_back(A); + do { + MachineBasicBlock *MBB = Stack.back(); + Stack.pop_back(); + + for (MachineBasicBlock *Succ : MBB->successors()) { + if (Succ == A) + return false; + if (Succ != B && Reachable.insert(Succ).second) + Stack.push_back(Succ); + } + } while (!Stack.empty()); + + return true; +} + +} // namespace AMDGPU + +} // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h new file mode 100644 index 000000000000..4f33a89a364b --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h @@ -0,0 +1,24 @@ +//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H + +namespace llvm { + +class MachineBasicBlock; + +namespace AMDGPU { + +bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB); + +} // end namespace AMDGPU +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h index 991408c81c92..9f0a4d29b5e4 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -73,7 +73,6 @@ FIELD2(amd_machine_version_stepping, machine_version_stepping, amd_machine_ve FIELD(kernel_code_entry_byte_offset), FIELD(kernel_code_prefetch_byte_size), -FIELD(max_scratch_backing_memory_byte_size), COMPPGM1(granulated_workitem_vgpr_count, compute_pgm_rsrc1_vgprs, VGPRS), COMPPGM1(granulated_wavefront_sgpr_count, compute_pgm_rsrc1_sgprs, SGPRS), diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt index 01b80ebe8d3d..c5ed32e46821 100644 --- a/lib/Target/AMDGPU/Utils/CMakeLists.txt +++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt @@ -2,4 +2,5 @@ add_llvm_library(LLVMAMDGPUUtils AMDGPUBaseInfo.cpp AMDKernelCodeTUtils.cpp AMDGPUAsmUtils.cpp + AMDGPULaneDominator.cpp ) diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index ff2bd2454400..4c7a92219755 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -40,17 +40,9 @@ class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> { } class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> : - InstSI <P.Outs32, P.Ins32, "", pattern>, - VOP <opName>, - SIMCInstr <!if(VOP1Only, opName, opName#"_e32"), SIEncodingFamily.NONE>, - MnemonicAlias<!if(VOP1Only, opName, opName#"_e32"), opName> { + VOP_Pseudo <opName, !if(VOP1Only, "", "_e32"), P, P.Outs32, P.Ins32, "", pattern> { - let isPseudo = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; - - string Mnemonic = opName; - string AsmOperands = P.Asm32; + let AsmOperands = P.Asm32; let Size = 4; let mayLoad = 0; @@ -63,8 +55,6 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit 
VOP1On let Uses = [EXEC]; let AsmVariantName = AMDGPUAsmVariants.Default; - - VOPProfile Pfl = P; } class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> : @@ -86,6 +76,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> : let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; + let Defs = ps.Defs; } class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -202,13 +193,14 @@ defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>; defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>; defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>; defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>; -defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>; let SchedRW = [WriteQuarterRate32] in { +defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>; defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>; defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>; -defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>; +defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>; defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>; +defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>; } // End SchedRW = [WriteQuarterRate32] let SchedRW = [WriteDouble] in { @@ -216,8 +208,6 @@ defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>; defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>; } // End SchedRW = [WriteDouble]; -defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>; - let SchedRW = [WriteDouble] in { defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>; } // End SchedRW = [WriteDouble] @@ -232,9 +222,9 @@ defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>; defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>; defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>; defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>; -defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>; let SchedRW = [WriteDoubleAdd] in { +defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>; defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>; defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>; } // End SchedRW = [WriteDoubleAdd] @@ -298,9 +288,7 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>; defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>; } // End Uses = [M0, EXEC] -let SchedRW = [WriteQuarterRate32] in { defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>; -} // These instruction only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -344,11 +332,15 @@ defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; +let SchedRW = [WriteQuarterRate32] in { defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>; defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>; defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>; defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>; +defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; +defm V_COS_F16 : VOP1Inst 
<"v_cos_f16", VOP_F16_F16, AMDGPUcos>; +} // End SchedRW = [WriteQuarterRate32] defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>; defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; @@ -356,8 +348,6 @@ defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>; defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>; defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>; defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; -defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; -defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } @@ -392,6 +382,12 @@ let SubtargetPredicate = isGFX9 in { def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>; } +defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; + +defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>; +defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>; +defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>; + } // End SubtargetPredicate = isGFX9 //===----------------------------------------------------------------------===// @@ -521,7 +517,7 @@ multiclass VOP1Only_Real_vi <bits<10> op> { } } -multiclass VOP1_Real_vi <bits<10> op> { +multiclass VOP1_Real_e32e64_vi <bits<10> op> { let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { def _e32_vi : VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>, @@ -530,6 +526,10 @@ multiclass VOP1_Real_vi <bits<10> op> { VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } +} + +multiclass VOP1_Real_vi <bits<10> op> { + defm NAME : VOP1_Real_e32e64_vi <op>; def _sdwa_vi : VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, @@ -593,9 +593,9 @@ defm V_FRACT_F64 : VOP1_Real_vi <0x32>; defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>; defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>; defm V_CLREXCP : VOP1_Real_vi <0x35>; -defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>; -defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>; -defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>; +defm V_MOVRELD_B32 : VOP1_Real_e32e64_vi <0x36>; +defm V_MOVRELS_B32 : VOP1_Real_e32e64_vi <0x37>; +defm V_MOVRELSD_B32 : VOP1_Real_e32e64_vi <0x38>; defm V_TRUNC_F64 : VOP1_Real_vi <0x17>; defm V_CEIL_F64 : VOP1_Real_vi <0x18>; defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>; @@ -622,6 +622,10 @@ defm V_SIN_F16 : VOP1_Real_vi <0x49>; defm V_COS_F16 : VOP1_Real_vi <0x4a>; defm V_SWAP_B32 : VOP1Only_Real_vi <0x51>; +defm V_SAT_PK_U8_I16 : VOP1_Real_vi<0x4f>; +defm V_CVT_NORM_I16_F16 : VOP1_Real_vi<0x4d>; +defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>; + // Copy of v_mov_b32 with $vdst as a use operand for use with VGPR // indexing mode. vdst can't be treated as a def for codegen purposes, // and an implicit use and def of the super register should be added. 
@@ -694,3 +698,23 @@ def : GCNPat < >; } // End OtherPredicates = [isVI] + +//===----------------------------------------------------------------------===// +// GFX9 +//===----------------------------------------------------------------------===// + +multiclass VOP1_Real_gfx9 <bits<10> op> { + let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in { + defm NAME : VOP1_Real_e32e64_vi <op>; + } + + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + + // For now left dpp only for asm/dasm + // TODO: add corresponding pseudo + def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>; +} + +defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index ef90b68db1a8..5ec1a15c5cd2 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -61,17 +61,9 @@ class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> { } class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> : - InstSI <P.Outs32, P.Ins32, "", pattern>, - VOP <opName>, - SIMCInstr <opName#suffix, SIEncodingFamily.NONE>, - MnemonicAlias<opName#suffix, opName> { + VOP_Pseudo <opName, suffix, P, P.Outs32, P.Ins32, "", pattern> { - let isPseudo = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; - - string Mnemonic = opName; - string AsmOperands = P.Asm32; + let AsmOperands = P.Asm32; let Size = 4; let mayLoad = 0; @@ -84,8 +76,6 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf let Uses = [EXEC]; let AsmVariantName = AMDGPUAsmVariants.Default; - - VOPProfile Pfl = P; } class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> : @@ -107,6 +97,7 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> : let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; + let Defs = ps.Defs; } class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -177,6 +168,10 @@ multiclass VOP2eInst <string opName, let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in { def _e32 : VOP2_Pseudo <opName, P>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _sdwa : VOP2_SDWA_Pseudo <opName, P> { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } } def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, @@ -303,12 +298,30 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above. let Asm32 = "$vdst, $src0, $src1, vcc"; let Asm64 = "$vdst, $src0, $src1, $src2"; + let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst); // Suppress src2 implied by type since the 32-bit encoding uses an // implicit VCC use. 
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); + + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); + + let InsDPP = (ins DstRCDPP:$old, + Src0DPP:$src0, + Src1DPP:$src1, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let HasExt = 1; + let HasSDWA9 = 1; } def VOP_READLANE : VOPProfile<[i32, i32, i32]> { @@ -322,15 +335,17 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let HasSDWA9 = 0; } -def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { +def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { let Outs32 = (outs VGPR_32:$vdst); let Outs64 = Outs32; - let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1); + let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in); let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; let HasExt = 0; let HasSDWA9 = 0; + let HasSrc2 = 0; + let HasSrc2Mods = 0; } //===----------------------------------------------------------------------===// @@ -398,7 +413,10 @@ let isConvergent = 1, Uses = []<Register> in { def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">; -def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">; +let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { +def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, + [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">; +} // End $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; @@ -407,11 +425,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32 defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst" -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>; defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>; } // End SubtargetPredicate = isGCN @@ -473,6 +491,19 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; } // End SubtargetPredicate = Has16BitInsts +let SubtargetPredicate = HasDLInsts in { + +defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>; + +let Constraints = "$vdst = $src2", + DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, + isCommutable = 1 in { +defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>; +} + 
+} // End SubtargetPredicate = HasDLInsts + // Note: 16-bit instructions produce a 0 result in the high 16-bits. multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> { @@ -639,7 +670,7 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>; defm V_READLANE_B32 : VOP2_Real_si <0x01>; -let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in { +let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in { defm V_WRITELANE_B32 : VOP2_Real_si <0x02>; } @@ -824,7 +855,7 @@ multiclass VOP2_Real_e32e64_vi <bits<6> op> : def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; } -defm V_CNDMASK_B32 : Base_VOP2_Real_e32e64_vi <0x0>; +defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>; defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>; defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>; defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>; @@ -926,3 +957,10 @@ def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; } // End SubtargetPredicate = isVI + +let SubtargetPredicate = HasDLInsts in { + +defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>; +defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>; + +} // End SubtargetPredicate = HasDLInsts diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index aedbfa015bf6..17ae08dc6267 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -153,19 +153,24 @@ class getVOP3VCC<VOPProfile P, SDPatternOperator node> { (i1 VCC)))]; } -class VOP3Features<bit Clamp, bit OpSel> { +class VOP3Features<bit Clamp, bit OpSel, bit Packed> { bit HasClamp = Clamp; bit HasOpSel = OpSel; + bit IsPacked = Packed; } -def VOP3_REGULAR : VOP3Features<0, 0>; -def VOP3_CLAMP : VOP3Features<1, 0>; -def VOP3_OPSEL : VOP3Features<1, 1>; +def VOP3_REGULAR : VOP3Features<0, 0, 0>; +def VOP3_CLAMP : VOP3Features<1, 0, 0>; +def VOP3_OPSEL : VOP3Features<1, 1, 0>; +def VOP3_PACKED : VOP3Features<1, 1, 1>; class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> { let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); + let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); + + let HasModifiers = !if(Features.IsPacked, 1, P.HasModifiers); // FIXME: Hack to stop printing _e64 let Outs64 = (outs DstRC.RegClass:$vdst); @@ -283,10 +288,10 @@ def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>; def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>; -def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>; def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; let SchedRW = [WriteDoubleAdd] in { +def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>; def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>; def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>; def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>; @@ -355,14 +360,12 @@ def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPU def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> { let SchedRW = [WriteFloatFMA, WriteSALU]; 
- let hasExtraSrcRegAllocReq = 1; let AsmMatchConverter = ""; } // Double precision division pre-scale. def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> { let SchedRW = [WriteDouble, WriteSALU]; - let hasExtraSrcRegAllocReq = 1; let AsmMatchConverter = ""; } @@ -376,6 +379,7 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3 let SchedRW = [WriteDouble]; } +let SchedRW = [Write64Bit] in { // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>; @@ -389,17 +393,17 @@ def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>; def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>; def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>; } // End SubtargetPredicate = isVI - +} // End SchedRW = [Write64Bit] let SubtargetPredicate = isCIVI in { -let Constraints = "@earlyclobber $vdst" in { +let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>; -} // End Constraints = "@earlyclobber $vdst" +} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] let isCommutable = 1 in { -let SchedRW = [WriteDouble, WriteSALU] in { +let SchedRW = [WriteQuarterRate32, WriteSALU] in { def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End SchedRW = [WriteDouble, WriteSALU] @@ -408,16 +412,16 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End SubtargetPredicate = isCIVI -let SubtargetPredicate = Has16BitInsts in { - -let renamedInGFX9 = 1 in { -def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; +def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> { + let Predicates = [Has16BitInsts, isVIOnly]; } -let SubtargetPredicate = isGFX9 in { -def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>; +def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", + VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> { + let renamedInGFX9 = 1; + let Predicates = [Has16BitInsts, isGFX9]; } -let isCommutable = 1 in { +let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in { let renamedInGFX9 = 1 in { def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; @@ -438,15 +442,14 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1 def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>; def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; -} // End isCommutable = 1 -} // End SubtargetPredicate = Has16BitInsts +} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 let SubtargetPredicate = isVI in { def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>; def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>; def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>; -def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_PERM_B32 : VOP3Inst 
<"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>; } // End SubtargetPredicate = isVI let Predicates = [Has16BitInsts] in { @@ -697,7 +700,7 @@ multiclass VOP3Interp_F16_Real_vi<bits<10> op> { let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in { multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, + def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, VOP3e_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName); let AsmString = AsmName # ps.AsmOperands; @@ -705,7 +708,7 @@ multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> { } multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>, + def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>, VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME); let AsmString = AsmName # ps.AsmOperands; @@ -713,7 +716,7 @@ multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> { } multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, + def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, VOP3Interp_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName); let AsmString = AsmName # ps.AsmOperands; @@ -721,9 +724,9 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> } multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>, - VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl> { - VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME); + def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.GFX9>, + VOP3e_vi <op, !cast<VOP_Pseudo>(NAME).Pfl> { + VOP_Pseudo ps = !cast<VOP_Pseudo>(NAME); let AsmString = AsmName # ps.AsmOperands; } } diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index eeee8b36c175..5c78ada3211e 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -68,6 +68,67 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1 def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>; def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>; +multiclass MadFmaMixPats<SDPatternOperator fma_like, + Instruction mix_inst, + Instruction mixlo_inst, + Instruction mixhi_inst> { + def : GCNPat < + (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (mixlo_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.NONE, + (i32 (IMPLICIT_DEF))) + >; + + // FIXME: Special case handling for maxhi (especially for clamp) + // because dealing with the write to high half of the register is + // difficult. 
+  def : GCNPat <
+    (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+                       $src1_modifiers, $src1,
+                       $src2_modifiers, $src2,
+                       DSTCLAMP.NONE,
+                       $elt0))
+  >;
+
+  def : GCNPat <
+    (build_vector
+      f16:$elt0,
+      (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+                       $src1_modifiers, $src1,
+                       $src2_modifiers, $src2,
+                       DSTCLAMP.ENABLE,
+                       $elt0))
+  >;
+
+  def : GCNPat <
+    (AMDGPUclamp (build_vector
+      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+    (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+                       $hi_src1_modifiers, $hi_src1,
+                       $hi_src2_modifiers, $hi_src2,
+                       DSTCLAMP.ENABLE,
+                       (mixlo_inst $lo_src0_modifiers, $lo_src0,
+                                   $lo_src1_modifiers, $lo_src1,
+                                   $lo_src2_modifiers, $lo_src2,
+                                   DSTCLAMP.ENABLE,
+                                   (i32 (IMPLICIT_DEF)))))
+  >;
+}
 
 let SubtargetPredicate = HasMadMixInsts in {
 // These are VOP3a-like opcodes which accept no omod.
@@ -84,68 +145,41 @@ def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16
 }
 }
 
-def : GCNPat <
-  (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
-  (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
-                   $src1_modifiers, $src1,
-                   $src2_modifiers, $src2,
-                   DSTCLAMP.NONE,
-                   (i32 (IMPLICIT_DEF)))
->;
+defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+} // End SubtargetPredicate = HasMadMixInsts
 
-// FIXME: Special case handling for maxhi (especially for clamp)
-// because dealing with the write to high half of the register is
-// difficult.
-def : GCNPat <
-  (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                          (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                          (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
-  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
-                          $src1_modifiers, $src1,
-                          $src2_modifiers, $src2,
-                          DSTCLAMP.NONE,
-                          $elt0))
->;
 
-def : GCNPat <
-  (build_vector
-    f16:$elt0,
-    (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
-  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
-                          $src1_modifiers, $src1,
-                          $src2_modifiers, $src2,
-                          DSTCLAMP.ENABLE,
-                          $elt0))
->;
+// Essentially the same as the mad_mix versions
+let SubtargetPredicate = HasFmaMixInsts in {
+let isCommutable = 1 in {
+def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
 
-def : GCNPat <
-  (AMDGPUclamp (build_vector
-    (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
-    (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
-  (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
-                          $hi_src1_modifiers, $hi_src1,
-                          $hi_src2_modifiers, $hi_src2,
-                          DSTCLAMP.ENABLE,
-                          (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
-                                           $lo_src1_modifiers, $lo_src1,
-                                           $lo_src2_modifiers, $lo_src2,
-                                           DSTCLAMP.ENABLE,
-                                           (i32 (IMPLICIT_DEF)))))
->;
+// Clamp modifier is applied after conversion to f16.
+def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
+def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+}
+}
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+}
 
-} // End SubtargetPredicate = [HasMadMixInsts]
+let SubtargetPredicate = HasDLInsts in {
+
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
+def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
+def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;
+
+} // End SubtargetPredicate = HasDLInsts
 
 multiclass VOP3P_Real_vi<bits<10> op> {
-  def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
-            VOP3Pe <op, !cast<VOP3P_Pseudo>(NAME).Pfl> {
+  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
     let AssemblerPredicates = [HasVOP3PInsts];
     let DecoderNamespace = "VI";
   }
@@ -172,6 +206,33 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
 defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
 defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
 
+
+let SubtargetPredicate = HasMadMixInsts in {
 defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
 defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
 defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+
+let SubtargetPredicate = HasFmaMixInsts in {
+let DecoderNamespace = "GFX9_DL" in {
+// The mad_mix instructions were renamed and their behaviors changed,
+// but the opcode stayed the same so we need to put these in a
+// different DecoderNamespace to avoid the ambiguity.
+defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x3a0>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+}
+
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
+defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
+defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
+defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
+
+} // End SubtargetPredicate = HasDLInsts
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index 146870e21531..cc6b8116afee 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -30,8 +30,8 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{31-25} = 0x3e; // encoding
 
   // VOPC disallows dst_sel and dst_unused as they have no effect on destination
-  let Inst{42-40} = SDWA.DWORD;
-  let Inst{44-43} = SDWA.UNUSED_PRESERVE;
+  let Inst{42-40} = 0;
+  let Inst{44-43} = 0;
 }
 
 class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
@@ -106,6 +106,7 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
   let TSFlags = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let Uses = ps.Uses;
+  let Defs = ps.Defs;
 }
 
 class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index f24ff5ce8dea..f0f7f259f71d 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -38,6 +38,23 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
   let Uses = [EXEC];
 }
 
+class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
+                  string asm, list<dag> pattern> :
+  InstSI <outs, ins, asm, pattern>,
+  VOP <opName>,
+  SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
+  MnemonicAlias<opName#suffix, opName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  VOPProfile Pfl = P;
+
+  string AsmOperands;
+}
+
 class VOP3Common <dag outs, dag ins, string asm = "",
                   list<dag> pattern = [], bit HasMods = 0,
                   bit VOP3Only = 0> :
@@ -66,26 +83,18 @@ class VOP3Common <dag outs, dag ins, string asm = "",
 
 class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
                    bit VOP3Only = 0, bit isVOP3P = 0, bit isVop3OpSel = 0> :
-  InstSI <P.Outs64,
-          !if(isVop3OpSel,
-              P.InsVOP3OpSel,
-              !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
-          "",
-          pattern>,
-  VOP <opName>,
-  SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
-  MnemonicAlias<opName#"_e64", opName> {
+  VOP_Pseudo <opName, "_e64", P, P.Outs64,
+              !if(isVop3OpSel,
+                  P.InsVOP3OpSel,
+                  !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
+              "", pattern> {
 
-  let isPseudo = 1;
-  let isCodeGenOnly = 1;
-  let UseNamedOperandTable = 1;
   let VOP3_OPSEL = isVop3OpSel;
   let IsPacked = P.IsPacked;
 
-  string Mnemonic = opName;
-  string AsmOperands = !if(isVop3OpSel,
-                           P.AsmVOP3OpSel,
-                           !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
+  let AsmOperands = !if(isVop3OpSel,
+                        P.AsmVOP3OpSel,
+                        !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
 
   let Size = 8;
   let mayLoad = 0;
@@ -120,8 +129,6 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
                     !if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
                         "cvtVOP3",
                         ""));
-
-  VOPProfile Pfl = P;
 }
 
 class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
@@ -129,7 +136,7 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
   let VOP3P = 1;
 }
 
-class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
+class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
   InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
   SIMCInstr <ps.PseudoInstr, EncodingFamily> {
 
@@ -149,13 +156,14 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
   let TSFlags = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let Uses = ps.Uses;
+  let Defs = ps.Defs;
 
   VOPProfile Pfl = ps.Pfl;
 }
 
 // XXX - Is there any reason to distingusih this from regular VOP3
 // here?
-class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> :
+class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily> :
   VOP3_Real<ps, EncodingFamily>;
 
 class VOP3a<VOPProfile P> : Enc64 {
@@ -324,13 +332,13 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
   bits<1> clamp;
 
   let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
-  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
-  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
+  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
   let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
-  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
   let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
   let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
-  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
   let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
   let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
 }
@@ -358,11 +366,11 @@ class VOP_SDWA9e<VOPProfile P> : Enc64 {
   bits<1> src1_sgpr;
 
   let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
-  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
   let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
   let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
   let Inst{55} = !if(P.HasSrc0, src0{8}, 0);
-  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
   let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
   let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
   let Inst{63} = 0; // src1_sgpr - should be specified in subclass
@@ -375,8 +383,8 @@ class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
   bits<1> clamp;
   bits<2> omod;
 
-  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
-  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
+  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
   let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
   let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
 }