vendor/llvm/llvm-trunk-r351319

author: Dimitry Andric <dim@FreeBSD.org> 2019-01-19 10:01:25 +0000
committer: Dimitry Andric <dim@FreeBSD.org> 2019-01-19 10:01:25 +0000
commit: d8e91e46262bc44006913e6796843909f1ac7bcd (patch)
tree: 7d0c143d9b38190e0fa0180805389da22cd834c5 /lib/Target/AMDGPU
parent: b7eb8e35e481a74962664b63dfb09483b200209a (diff)
126 files changed, 12013 insertions, 4897 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 796766d946221..bb7801c172f60 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -37,10 +37,13 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
 FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
 
 // SI Passes
+FunctionPass *createGCNDPPCombinePass();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIFixupVectorISelPass();
+FunctionPass *createSIAddIMGInitPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
@@ -57,6 +60,7 @@ FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createSIModeRegisterPass();
 
 void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
 
@@ -69,10 +73,18 @@ Pass *createAMDGPUAnnotateKernelFeaturesPass();
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
+FunctionPass *createAMDGPUAtomicOptimizerPass();
+void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
+extern char &AMDGPUAtomicOptimizerID;
+
 ModulePass *createAMDGPULowerIntrinsicsPass();
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
 
+ModulePass *createAMDGPUFixFunctionBitcastsPass();
+void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
+extern char &AMDGPUFixFunctionBitcastsID;
+
 FunctionPass *createAMDGPULowerKernelArgumentsPass();
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
@@ -84,6 +96,9 @@ extern char &AMDGPULowerKernelAttributesID;
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
+void initializeGCNDPPCombinePass(PassRegistry &);
+extern char &GCNDPPCombineID;
+
 void initializeR600ClauseMergePassPass(PassRegistry &);
 extern char &R600ClauseMergePassID;
 
@@ -114,6 +129,9 @@ extern char &SIFixSGPRCopiesID;
 void initializeSIFixVGPRCopiesPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
+void initializeSIFixupVectorISelPass(PassRegistry &);
+extern char &SIFixupVectorISelID;
+
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
@@ -141,6 +159,9 @@ extern char &AMDGPUSimplifyLibCallsID;
 void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
 extern char &AMDGPUUseNativeCallsID;
 
+void initializeSIAddIMGInitPass(PassRegistry &);
+extern char &SIAddIMGInitID;
+
 void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
 extern char &AMDGPUPerfHintAnalysisID;
 
@@ -179,6 +200,9 @@ extern char &SIMemoryLegalizerID;
 void initializeSIDebuggerInsertNopsPass(PassRegistry&);
 extern char &SIDebuggerInsertNopsID;
 
+void initializeSIModeRegisterPass(PassRegistry&);
+extern char &SIModeRegisterID;
+
 void initializeSIInsertWaitcntsPass(PassRegistry&);
 extern char &SIInsertWaitcntsID;
 
@@ -190,6 +214,8 @@ extern char &AMDGPUUnifyDivergentExitNodesID;
 
 ImmutablePass *createAMDGPUAAWrapperPass();
 void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
+ImmutablePass *createAMDGPUExternalAAWrapperPass();
+void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);
 
 void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
 
@@ -221,19 +247,18 @@ enum TargetIndex {
 /// however on the GPU, each address space points to
 /// a separate piece of memory that is unique from other
 /// memory locations.
-struct AMDGPUAS {
-  // The following address space values depend on the triple environment.
-  unsigned PRIVATE_ADDRESS;  ///< Address space for private memory.
-  unsigned FLAT_ADDRESS;     ///< Address space for flat memory.
-  unsigned REGION_ADDRESS;   ///< Address space for region memory.
-
+namespace AMDGPUAS {
   enum : unsigned {
     // The maximum value for flat, generic, local, private, constant and region.
-    MAX_COMMON_ADDRESS = 5,
+    MAX_AMDGPU_ADDRESS = 6,
 
+    FLAT_ADDRESS = 0,     ///< Address space for flat memory.
     GLOBAL_ADDRESS = 1,   ///< Address space for global memory (RAT0, VTX0).
+    REGION_ADDRESS = 2,   ///< Address space for region memory.
+
     CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2)
     LOCAL_ADDRESS = 3,    ///< Address space for local memory.
+    PRIVATE_ADDRESS = 5,  ///< Address space for private memory.
 
     CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
 
@@ -268,14 +293,6 @@ struct AMDGPUAS {
     // Some places use this if the address space can't be determined.
     UNKNOWN_ADDRESS_SPACE = ~0u,
   };
-};
-
-namespace llvm {
-namespace AMDGPU {
-AMDGPUAS getAMDGPUAS(const Module &M);
-AMDGPUAS getAMDGPUAS(const TargetMachine &TM);
-AMDGPUAS getAMDGPUAS(Triple T);
-} // namespace AMDGPU
-} // namespace llvm
+}
 
 #endif
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 16c2a366db285..6a4cfe08e4910 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -11,6 +11,10 @@ include "llvm/TableGen/SearchableTable.td"
 include "llvm/Target/Target.td"
 include "AMDGPUFeatures.td"
 
+class BoolToList<bit Value> {
+  list<int> ret = !if(Value, [1]<int>, []<int>);
+}
+
 //===------------------------------------------------------------===//
 // Subtarget Features (device properties)
 //===------------------------------------------------------------===//
@@ -140,6 +144,12 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts",
   "Additional instructions for CI+"
 >;
 
+def FeatureVIInsts : SubtargetFeature<"vi-insts",
+  "VIInsts",
+  "true",
+  "Additional instructions for VI+"
+>;
+
 def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
   "GFX9Insts",
   "true",
@@ -236,6 +246,12 @@ def FeatureDPP : SubtargetFeature<"dpp",
   "Support DPP (Data Parallel Primitives) extension"
 >;
 
+def FeatureR128A16 : SubtargetFeature<"r128-a16",
+  "HasR128A16",
+  "true",
+  "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9"
+>;
+
 def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
   "HasIntClamp",
   "true",
@@ -251,31 +267,25 @@ def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
 def FeatureDLInsts : SubtargetFeature<"dl-insts",
   "HasDLInsts",
   "true",
-  "Has deep learning instructions"
+  "Has v_fmac_f32 and v_xnor_b32 instructions"
 >;
 
-def FeatureD16PreservesUnusedBits : SubtargetFeature<
-  "d16-preserves-unused-bits",
-  "D16PreservesUnusedBits",
+def FeatureDotInsts : SubtargetFeature<"dot-insts",
+  "HasDotInsts",
   "true",
-  "If present, then instructions defined by HasD16LoadStore predicate preserve "
-  "unused bits. Otherwise instructions defined by HasD16LoadStore predicate "
-  "zero unused bits."
+  "Has v_dot* instructions"
+>;
+
+def FeatureSRAMECC : SubtargetFeature<"sram-ecc",
+  "EnableSRAMECC",
+  "true",
+  "Enable SRAM ECC"
 >;
 
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
 
-// Some instructions do not support denormals despite this flag. Using
-// fp32 denormals also causes instructions to run at the double
-// precision rate for the device.
-def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
-  "FP32Denormals",
-  "true",
-  "Enable single precision denormal handling"
->;
-
 // Denormal handling for fp64 and fp16 is controlled by the same
 // config register when fp16 supported.
 // TODO: Do we need a separate f16 setting when not legal?
@@ -324,12 +334,6 @@ def FeatureEnableHugePrivateBuffer : SubtargetFeature<
   "Enable private/scratch buffer sizes greater than 128 GB"
 >;
 
-def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
-  "EnableVGPRSpilling",
-  "true",
-  "Enable spilling of VGPRs to scratch memory"
->;
-
 def FeatureDumpCode : SubtargetFeature <"DumpCode",
   "DumpCode",
   "true",
@@ -373,6 +377,16 @@ def FeatureEnableDS128 : SubtargetFeature<"enable-ds128",
   "Use ds_{read|write}_b128"
 >;
 
+// Sparse texture support requires that all result registers are zeroed when
+// PRTStrictNull is set to true. This feature is turned on for all architectures
+// but is enabled as a feature in case there are situations where PRTStrictNull
+// is disabled by the driver.
+def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null",
+  "EnablePRTStrictNull",
+  "true",
+  "Enable zeroing of result registers for sparse texture fetches"
+>;
+
 // Unless +-flat-for-global is specified, turn on FlatForGlobal for
 // all OS-es on VI and newer hardware to avoid assertion failures due
 // to missing ADDR64 variants of MUBUF instructions.
@@ -399,6 +413,12 @@ def FeatureCodeObjectV3 : SubtargetFeature <
   "Generate code object version 3"
 >;
 
+def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
+  "HasTrigReducedRange",
+  "true",
+  "Requires use of fract on arguments to trig instructions"
+>;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
@@ -418,36 +438,36 @@ class GCNSubtargetFeatureGeneration <string Value,
 def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
   [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
   FeatureWavefrontSize64, FeatureGCN,
-  FeatureLDSBankCount32, FeatureMovrel]
+  FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange]
 >;
 
 def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
   [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
   FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
-  FeatureCIInsts, FeatureMovrel]
+  FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange]
 >;
 
 def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
   [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
    FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
-   FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+   FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts,
    FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
    FeatureScalarStores, FeatureInv2PiInlineImm,
    FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
-   FeatureIntClamp
+   FeatureIntClamp, FeatureTrigReducedRange
   ]
 >;
 
 def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
   [FeatureFP64, FeatureLocalMemorySize65536,
    FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
-   FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+   FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts,
    FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
    FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
    FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
    FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
    FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
-   FeatureAddNoCarryInsts, FeatureScalarAtomics
+   FeatureAddNoCarryInsts, FeatureScalarAtomics, FeatureR128A16
   ]
 >;
 
@@ -465,34 +485,41 @@ def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0,
   [FeatureSouthernIslands,
    FeatureFastFMAF32,
    HalfRate64Ops,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1,
   [FeatureSouthernIslands,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
   [FeatureSeaIslands,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
   [FeatureSeaIslands,
    HalfRate64Ops,
    FeatureLDSBankCount32,
-   FeatureFastFMAF32]>;
+   FeatureFastFMAF32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
   [FeatureSeaIslands,
    FeatureLDSBankCount16,
-   FeatureFastFMAF32]>;
+   FeatureFastFMAF32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
   [FeatureSeaIslands,
-   FeatureLDSBankCount16]>;
+   FeatureLDSBankCount16,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
   [FeatureSeaIslands,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
   [FeatureVolcanicIslands,
@@ -500,49 +527,63 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
    HalfRate64Ops,
    FeatureLDSBankCount32,
    FeatureXNACK,
-   FeatureUnpackedD16VMem]>;
+   FeatureUnpackedD16VMem,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount32,
    FeatureSGPRInitBug,
-   FeatureUnpackedD16VMem]>;
+   FeatureUnpackedD16VMem,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount32,
-   FeatureUnpackedD16VMem]>;
+   FeatureUnpackedD16VMem,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount16,
-   FeatureXNACK]>;
+   FeatureXNACK,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
    FeatureXNACK,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
   [FeatureGFX9,
    FeatureLDSBankCount32,
    FeatureFmaMixInsts,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
   [FeatureGFX9,
    HalfRate64Ops,
    FeatureFmaMixInsts,
    FeatureLDSBankCount32,
-   FeatureDLInsts]>;
+   FeatureDLInsts,
+   FeatureDotInsts,
+   FeatureSRAMECC,
+   FeatureCodeObjectV3]>;
+
+def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9,
+  [FeatureGFX9,
+   FeatureMadMixInsts,
+   FeatureLDSBankCount32,
+   FeatureXNACK,
+   FeatureCodeObjectV3]>;
 
 //===----------------------------------------------------------------------===//
 // Debugger related subtarget features.
@@ -674,8 +715,9 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
 def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
   AssemblerPredicate<"!FeatureUnpackedD16VMem">;
 
-def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">,
-  AssemblerPredicate<"FeatureD16PreservesUnusedBits">;
+def D16PreservesUnusedBits :
+  Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">,
+  AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
 
 def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
 def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
@@ -683,10 +725,10 @@ def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
 def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
   AssemblerPredicate<"FeatureGFX9Insts">;
 
-def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarryInsts()">,
+def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
   AssemblerPredicate<"FeatureAddNoCarryInsts">;
 
-def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">,
+def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">,
   AssemblerPredicate<"!FeatureAddNoCarryInsts">;
 
 def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
@@ -706,6 +748,9 @@ def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">,
 def HasDPP : Predicate<"Subtarget->hasDPP()">,
   AssemblerPredicate<"FeatureDPP">;
 
+def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
+  AssemblerPredicate<"FeatureR128A16">;
+
 def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
   AssemblerPredicate<"FeatureIntClamp">;
 
@@ -728,6 +773,9 @@ def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
 def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
   AssemblerPredicate<"FeatureDLInsts">;
 
+def HasDotInsts : Predicate<"Subtarget->hasDotInsts()">,
+  AssemblerPredicate<"FeatureDotInsts">;
+
 
 def EnableLateCFGStructurize : Predicate<
   "EnableLateStructurizeCFG">;
@@ -736,7 +784,6 @@ def EnableLateCFGStructurize : Predicate<
 include "SISchedule.td"
 include "GCNProcessors.td"
 include "AMDGPUInstrInfo.td"
-include "AMDGPUIntrinsics.td"
 include "SIIntrinsics.td"
 include "AMDGPURegisterInfo.td"
 include "AMDGPURegisterBanks.td"
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index ef4b69d09d9f5..73709ba13643e 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -34,69 +34,45 @@ using namespace llvm;
 
 // Register this pass...
 char AMDGPUAAWrapperPass::ID = 0;
+char AMDGPUExternalAAWrapper::ID = 0;
 
 INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa",
                 "AMDGPU Address space based Alias Analysis", false, true)
 
+INITIALIZE_PASS(AMDGPUExternalAAWrapper, "amdgpu-aa-wrapper",
+                "AMDGPU Address space based Alias Analysis Wrapper", false, true)
+
 ImmutablePass *llvm::createAMDGPUAAWrapperPass() {
   return new AMDGPUAAWrapperPass();
 }
 
+ImmutablePass *llvm::createAMDGPUExternalAAWrapperPass() {
+  return new AMDGPUExternalAAWrapper();
+}
+
 void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
 }
 
-// Must match the table in getAliasResult.
-AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_)
-  : Arch(Arch_), AS(AS_) {
-  // These arrarys are indexed by address space value
-  // enum elements 0 ... to 5
-  static const AliasResult ASAliasRulesPrivIsZero[6][6] = {
-  /*             Private    Global    Constant  Group     Flat      Region*/
-  /* Private  */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias, NoAlias},
-  /* Global   */ {NoAlias , MayAlias, NoAlias , NoAlias , MayAlias, NoAlias},
-  /* Constant */ {NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, NoAlias},
-  /* Group    */ {NoAlias , NoAlias , NoAlias , MayAlias, MayAlias, NoAlias},
-  /* Flat     */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
-  /* Region   */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
-  };
-  static const AliasResult ASAliasRulesGenIsZero[6][6] = {
-  /*             Flat       Global    Region    Group     Constant  Private */
-  /* Flat     */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
-  /* Global   */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
-  /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias,  NoAlias},
-  /* Group    */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias},
-  /* Region   */ {MayAlias, NoAlias , NoAlias , NoAlias,  MayAlias, NoAlias},
-  /* Private  */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias}
-  };
-  assert(AS.MAX_COMMON_ADDRESS <= 5);
-  if (AS.FLAT_ADDRESS == 0) {
-    assert(AS.GLOBAL_ADDRESS   == 1 &&
-           AS.REGION_ADDRESS   == 2 &&
-           AS.LOCAL_ADDRESS    == 3 &&
-           AS.CONSTANT_ADDRESS == 4 &&
-           AS.PRIVATE_ADDRESS  == 5);
-    ASAliasRules = &ASAliasRulesGenIsZero;
-  } else {
-    assert(AS.PRIVATE_ADDRESS  == 0 &&
-           AS.GLOBAL_ADDRESS   == 1 &&
-           AS.CONSTANT_ADDRESS == 2 &&
-           AS.LOCAL_ADDRESS    == 3 &&
-           AS.FLAT_ADDRESS     == 4 &&
-           AS.REGION_ADDRESS   == 5);
-    ASAliasRules = &ASAliasRulesPrivIsZero;
-  }
-}
+// These arrays are indexed by address space value enum elements 0 ... to 6
+static const AliasResult ASAliasRules[7][7] = {
+  /*                    Flat       Global    Region    Group     Constant  Private   Constant 32-bit */
+  /* Flat     */        {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
+  /* Global   */        {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias},
+  /* Region   */        {MayAlias, NoAlias , NoAlias , NoAlias,  MayAlias, NoAlias , MayAlias},
+  /* Group    */        {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias},
+  /* Constant */        {MayAlias, MayAlias, MayAlias, NoAlias , NoAlias,  NoAlias , MayAlias},
+  /* Private  */        {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias},
+  /* Constant 32-bit */ {MayAlias, MayAlias, MayAlias, NoAlias , MayAlias, NoAlias , NoAlias}
+};
 
-AliasResult AMDGPUAAResult::ASAliasRulesTy::getAliasResult(unsigned AS1,
-    unsigned AS2) const {
-  if (AS1 > AS.MAX_COMMON_ADDRESS || AS2 > AS.MAX_COMMON_ADDRESS) {
-    if (Arch == Triple::amdgcn)
-      report_fatal_error("Pointer address space out of range");
-    return AS1 == AS2 ? MayAlias : NoAlias;
-  }
+static AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
+  static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 6, "Addr space out of range");
 
-  return (*ASAliasRules)[AS1][AS2];
+  if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
+    return MayAlias;
+
+  return ASAliasRules[AS1][AS2];
 }
 
 AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
@@ -104,8 +80,9 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
   unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace();
   unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
 
-  AliasResult Result = ASAliasRules.getAliasResult(asA, asB);
-  if (Result == NoAlias) return Result;
+  AliasResult Result = getAliasResult(asA, asB);
+  if (Result == NoAlias)
+    return Result;
 
   // Forward the query to the next alias analysis.
   return AAResultBase::alias(LocA, LocB);
@@ -114,9 +91,9 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
 bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
                                             bool OrLocal) {
   const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
-
-  if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS ||
-      Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) {
+  unsigned AS = Base->getType()->getPointerAddressSpace();
+  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
     return true;
   }
 
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index 645a38af753ce..d76c9fc481995 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -33,14 +33,12 @@ class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> {
   friend AAResultBase<AMDGPUAAResult>;
 
   const DataLayout &DL;
-  AMDGPUAS AS;
 
 public:
   explicit AMDGPUAAResult(const DataLayout &DL, Triple T) : AAResultBase(),
-    DL(DL), AS(AMDGPU::getAMDGPUAS(T)), ASAliasRules(AS, T.getArch()) {}
+    DL(DL) {}
   AMDGPUAAResult(AMDGPUAAResult &&Arg)
-      : AAResultBase(std::move(Arg)), DL(Arg.DL), AS(Arg.AS),
-        ASAliasRules(Arg.ASAliasRules){}
+      : AAResultBase(std::move(Arg)), DL(Arg.DL) {}
 
   /// Handle invalidation events from the new pass manager.
   ///
@@ -53,18 +51,6 @@ public:
 private:
   bool Aliases(const MDNode *A, const MDNode *B) const;
   bool PathAliases(const MDNode *A, const MDNode *B) const;
-
-  class ASAliasRulesTy {
-  public:
-    ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_);
-
-    AliasResult getAliasResult(unsigned AS1, unsigned AS2) const;
-
-  private:
-    Triple::ArchType Arch;
-    AMDGPUAS AS;
-    const AliasResult (*ASAliasRules)[6][6];
-  } ASAliasRules;
 };
 
 /// Analysis pass providing a never-invalidated alias analysis result.
@@ -110,6 +96,19 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 };
 
+// Wrapper around ExternalAAWrapperPass so that the default constructor gets the
+// callback.
+class AMDGPUExternalAAWrapper : public ExternalAAWrapperPass {
+public:
+  static char ID;
+
+  AMDGPUExternalAAWrapper() : ExternalAAWrapperPass(
+    [](Pass &P, Function &, AAResults &AAR) {
+      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+        AAR.addAAResult(WrapperPass->getResult());
+    }) {}
+};
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index d4bbb2c1eb8d1..fc65430b745f3 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -86,8 +86,6 @@ void AMDGPUAlwaysInline::recursivelyVisitUsers(
 }
 
 bool AMDGPUAlwaysInline::runOnModule(Module &M) {
-  AMDGPUAS AMDGPUAS = AMDGPU::getAMDGPUAS(M);
-
   std::vector<GlobalAlias*> AliasesToRemove;
 
   SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
@@ -122,7 +120,7 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
   for (GlobalVariable &GV : M.globals()) {
     // TODO: Region address
     unsigned AS = GV.getType()->getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS.REGION_ADDRESS)
+    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
       continue;
 
     recursivelyVisitUsers(GV, FuncsToAlwaysInline);
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 1a70833a4472f..896ac9c87779e 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -46,7 +46,6 @@ namespace {
 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
 private:
   const TargetMachine *TM = nullptr;
-  AMDGPUAS AS;
 
   bool addFeatureAttributes(Function &F);
 
@@ -67,11 +66,10 @@ public:
     CallGraphSCCPass::getAnalysisUsage(AU);
   }
 
-  static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
+  static bool visitConstantExpr(const ConstantExpr *CE);
   static bool visitConstantExprsRecursively(
     const Constant *EntryC,
-    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
-    AMDGPUAS AS);
+    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
 };
 
 } // end anonymous namespace
@@ -85,20 +83,18 @@ INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
 
 
 // The queue ptr is only needed when casting to flat, not from it.
-static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) {
-  return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
+static bool castRequiresQueuePtr(unsigned SrcAS) {
+  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
 }
 
-static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC,
-    const AMDGPUAS &AS) {
-  return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS);
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
+  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
 }
 
-bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
-    AMDGPUAS AS) {
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
   if (CE->getOpcode() == Instruction::AddrSpaceCast) {
     unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
-    return castRequiresQueuePtr(SrcAS, AS);
+    return castRequiresQueuePtr(SrcAS);
   }
 
   return false;
@@ -106,8 +102,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
 
 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
   const Constant *EntryC,
-  SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
-  AMDGPUAS AS) {
+  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
 
   if (!ConstantExprVisited.insert(EntryC).second)
     return false;
@@ -120,7 +115,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
 
     // Check this constant expression.
     if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
-      if (visitConstantExpr(CE, AS))
+      if (visitConstantExpr(CE))
         return true;
     }
 
@@ -262,7 +257,7 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
         continue;
 
       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
-        if (castRequiresQueuePtr(ASC, AS)) {
+        if (castRequiresQueuePtr(ASC)) {
           NeedQueuePtr = true;
           continue;
         }
@@ -273,7 +268,7 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
         if (!OpC)
           continue;
 
-        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
+        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
           NeedQueuePtr = true;
           break;
         }
@@ -318,7 +313,6 @@ bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
   if (!TPC)
     report_fatal_error("TargetMachine is required");
 
-  AS = AMDGPU::getAMDGPUAS(CG.getModule());
   TM = &TPC->getTM<TargetMachine>();
   return false;
 }
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index ed5370826647f..f88e3b0dac860 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -16,7 +16,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/IR/IRBuilder.h"
@@ -32,12 +32,11 @@ namespace {
 
 class AMDGPUAnnotateUniformValues : public FunctionPass,
                        public InstVisitor<AMDGPUAnnotateUniformValues> {
-  DivergenceAnalysis *DA;
+  LegacyDivergenceAnalysis *DA;
   MemoryDependenceResults *MDR;
   LoopInfo *LI;
   DenseMap<Value*, GetElementPtrInst*> noClobberClones;
   bool isKernelFunc;
-  AMDGPUAS AMDGPUASI;
 
 public:
   static char ID;
@@ -49,7 +48,7 @@ public:
     return "AMDGPU Annotate Uniform Values";
   }
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
     AU.addRequired<MemoryDependenceWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
     AU.setPreservesAll();
@@ -64,7 +63,7 @@ public:
 
 INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                       "Add AMDGPU uniform metadata", false, false)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
@@ -118,14 +117,8 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
 }
 
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
-  if (I.isUnconditional())
-    return;
-
-  Value *Cond = I.getCondition();
-  if (!DA->isUniform(Cond))
-    return;
-
-  setUniformMetadata(I.getParent()->getTerminator());
+  if (DA->isUniform(&I))
+    setUniformMetadata(I.getParent()->getTerminator());
 }
 
 void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
@@ -133,7 +126,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
   if (!DA->isUniform(Ptr))
     return;
   auto isGlobalLoad = [&](LoadInst &Load)->bool {
-    return Load.getPointerAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
+    return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
   };
   // We're tracking up to the Function boundaries
   // We cannot go beyond because of FunctionPass restrictions
@@ -168,7 +161,6 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
 }
 
 bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
-  AMDGPUASI = AMDGPU::getAMDGPUAS(M);
   return false;
 }
 
@@ -176,7 +168,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
 
-  DA  = &getAnalysis<DivergenceAnalysis>();
+  DA  = &getAnalysis<LegacyDivergenceAnalysis>();
   MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
   LI  = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index e62e5d52ad74f..2ded7cdb64899 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -40,11 +40,13 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/AMDGPUMetadata.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetParser.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 
 using namespace llvm;
 using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::HSAMD;
 
 // TODO: This should get the default rounding mode from the kernel. We just set
 // the default here, but this could change if the OpenCL rounding mode pragmas
@@ -98,8 +100,11 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                    std::unique_ptr<MCStreamer> Streamer)
   : AsmPrinter(TM, std::move(Streamer)) {
-    AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
-  }
+    if (IsaInfo::hasCodeObjectV3(getSTI()))
+      HSAMetadataStream.reset(new MetadataStreamerV3());
+    else
+      HSAMetadataStream.reset(new MetadataStreamerV2());
+}
 
 StringRef AMDGPUAsmPrinter::getPassName() const {
   return "AMDGPU Assembly Printer";
@@ -116,62 +121,70 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
 }
 
 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
-  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
-      TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    return;
+  if (IsaInfo::hasCodeObjectV3(getSTI())) {
+    std::string ExpectedTarget;
+    raw_string_ostream ExpectedTargetOS(ExpectedTarget);
+    IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS);
+
+    getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
+  }
 
   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
       TM.getTargetTriple().getOS() != Triple::AMDPAL)
     return;
 
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    HSAMetadataStream.begin(M);
+    HSAMetadataStream->begin(M);
 
   if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
     readPALMetadata(M);
 
+  if (IsaInfo::hasCodeObjectV3(getSTI()))
+    return;
+
   // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
     getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
 
   // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
-  IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(getSTI()->getFeatureBits());
+  IsaVersion Version = getIsaVersion(getSTI()->getCPU());
   getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
-      ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU");
+      Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
 }
 
 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
-  // TODO: Add metadata to code object v3.
-  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
-      TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    return;
-
   // Following code requires TargetStreamer to be present.
   if (!getTargetStreamer())
     return;
 
-  // Emit ISA Version (NT_AMD_AMDGPU_ISA).
-  std::string ISAVersionString;
-  raw_string_ostream ISAVersionStream(ISAVersionString);
-  IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
-  getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
+  if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+    // Emit ISA Version (NT_AMD_AMDGPU_ISA).
+    std::string ISAVersionString;
+    raw_string_ostream ISAVersionStream(ISAVersionString);
+    IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
+    getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
+  }
 
   // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
-    HSAMetadataStream.end();
-    getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata());
+    HSAMetadataStream->end();
+    bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
+    (void)Success;
+    assert(Success && "Malformed HSA Metadata");
   }
 
-  // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
-  if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
-    // Copy the PAL metadata from the map where we collected it into a vector,
-    // then write it as a .note.
-    PALMD::Metadata PALMetadataVector;
-    for (auto i : PALMetadataMap) {
-      PALMetadataVector.push_back(i.first);
-      PALMetadataVector.push_back(i.second);
+  if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+    // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
+    if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
+      // Copy the PAL metadata from the map where we collected it into a vector,
+      // then write it as a .note.
+      PALMD::Metadata PALMetadataVector;
+      for (auto i : PALMetadataMap) {
+        PALMetadataVector.push_back(i.first);
+        PALMetadataVector.push_back(i.second);
+      }
+      getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
     }
-    getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
   }
 }
 
@@ -193,13 +206,10 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
   if (!MFI.isEntryFunction())
     return;
-  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
-      TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    return;
 
   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
   const Function &F = MF->getFunction();
-  if (STM.isAmdCodeObjectV2(F) &&
+  if (!STM.hasCodeObjectV3() && STM.isAmdHsaOrMesa(F) &&
       (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
        F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
     amd_kernel_code_t KernelCode;
@@ -207,10 +217,8 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
     getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
   }
 
-  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
-    return;
-
-  HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
+  if (STM.isAmdHsaOS())
+    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
 }
 
 void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
@@ -241,7 +249,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
       *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
       CurrentProgramInfo.NumVGPRsForWavesPerEU,
       CurrentProgramInfo.NumSGPRsForWavesPerEU -
-          IsaInfo::getNumExtraSGPRs(getSTI()->getFeatureBits(),
+          IsaInfo::getNumExtraSGPRs(getSTI(),
                                     CurrentProgramInfo.VCCUsed,
                                     CurrentProgramInfo.FlatUsed),
       CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
@@ -259,7 +267,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
 
   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
-  if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(MF->getFunction())) {
+  if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
     SmallString<128> SymbolName;
     getNameWithPrefix(SymbolName, &MF->getFunction()),
     getTargetStreamer()->EmitAMDGPUSymbolType(
@@ -562,7 +570,7 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
 
 int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
   const GCNSubtarget &ST) const {
-  return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(),
+  return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
                                                      UsesVCC, UsesFlatScratch);
 }
 
@@ -759,7 +767,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
 
           // 48 SGPRs - vcc, - flat_scr, -xnack
           int MaxSGPRGuess =
-              47 - IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), true,
+              47 - IsaInfo::getNumExtraSGPRs(getSTI(), true,
                                              ST.hasFlatAddressSpace());
           MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
           MaxVGPR = std::max(MaxVGPR, 23);
@@ -824,7 +832,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
   // unified.
   unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
-      STM.getFeatureBits(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
+      getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
 
   // Check the addressable register limit before we add ExtraSGPRs.
   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
@@ -906,9 +914,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   }
 
   ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
-      STM.getFeatureBits(), ProgInfo.NumSGPRsForWavesPerEU);
+      &STM, ProgInfo.NumSGPRsForWavesPerEU);
   ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
-      STM.getFeatureBits(), ProgInfo.NumVGPRsForWavesPerEU);
+      &STM, ProgInfo.NumVGPRsForWavesPerEU);
 
   // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
   // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
@@ -1003,7 +1011,6 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
 
 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                          const SIProgramInfo &CurrentProgramInfo) {
-  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
 
@@ -1024,10 +1031,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
     OutStreamer->EmitIntValue(RsrcReg, 4);
     OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                               S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
-    if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
-      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
-      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
-    }
+    OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+    OutStreamer->EmitIntValue(
+        S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
   }
 
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
@@ -1138,7 +1144,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
 
-  AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
+  AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI());
 
   Out.compute_pgm_resource_registers =
       CurrentProgramInfo.ComputePGMRSrc1 |
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 22982d912c708..167ac4b21e1e2 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -56,7 +56,7 @@ private:
   SIProgramInfo CurrentProgramInfo;
   DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
 
-  AMDGPU::HSAMD::MetadataStreamer HSAMetadataStream;
+  std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
   std::map<uint32_t, uint32_t> PALMetadataMap;
 
   uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
@@ -143,7 +143,6 @@ public:
 protected:
   mutable std::vector<std::string> DisasmLines, HexLines;
   mutable size_t DisasmLineMaxLen;
-  AMDGPUAS AMDGPUASI;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
new file mode 100644
index 0000000000000..644e4fd558bad
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -0,0 +1,458 @@
+//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass optimizes atomic operations by using a single lane of a wavefront
+/// to perform the atomic operation, thus reducing contention on that memory
+/// location.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-atomic-optimizer"
+
+using namespace llvm;
+
+namespace {
+
+enum DPP_CTRL {
+  DPP_ROW_SR1 = 0x111,
+  DPP_ROW_SR2 = 0x112,
+  DPP_ROW_SR4 = 0x114,
+  DPP_ROW_SR8 = 0x118,
+  DPP_WF_SR1 = 0x138,
+  DPP_ROW_BCAST15 = 0x142,
+  DPP_ROW_BCAST31 = 0x143
+};
+
+struct ReplacementInfo {
+  Instruction *I;
+  Instruction::BinaryOps Op;
+  unsigned ValIdx;
+  bool ValDivergent;
+};
+
+class AMDGPUAtomicOptimizer : public FunctionPass,
+                              public InstVisitor<AMDGPUAtomicOptimizer> {
+private:
+  SmallVector<ReplacementInfo, 8> ToReplace;
+  const LegacyDivergenceAnalysis *DA;
+  const DataLayout *DL;
+  DominatorTree *DT;
+  bool HasDPP;
+  bool IsPixelShader;
+
+  void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
+                      unsigned ValIdx, bool ValDivergent) const;
+
+  void setConvergent(CallInst *const CI) const;
+
+public:
+  static char ID;
+
+  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
+    AU.addRequired<TargetPassConfig>();
+  }
+
+  void visitAtomicRMWInst(AtomicRMWInst &I);
+  void visitIntrinsicInst(IntrinsicInst &I);
+};
+
+} // namespace
+
+char AMDGPUAtomicOptimizer::ID = 0;
+
+char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;
+
+bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
+  if (skipFunction(F)) {
+    return false;
+  }
+
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();
+  DL = &F.getParent()->getDataLayout();
+  DominatorTreeWrapperPass *const DTW =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTW ? &DTW->getDomTree() : nullptr;
+  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+  const TargetMachine &TM = TPC.getTM<TargetMachine>();
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+  HasDPP = ST.hasDPP();
+  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+
+  visit(F);
+
+  const bool Changed = !ToReplace.empty();
+
+  for (ReplacementInfo &Info : ToReplace) {
+    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
+  }
+
+  ToReplace.clear();
+
+  return Changed;
+}
+
+void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
+  // Early exit for unhandled address space atomic instructions.
+  switch (I.getPointerAddressSpace()) {
+  default:
+    return;
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::LOCAL_ADDRESS:
+    break;
+  }
+
+  Instruction::BinaryOps Op;
+
+  switch (I.getOperation()) {
+  default:
+    return;
+  case AtomicRMWInst::Add:
+    Op = Instruction::Add;
+    break;
+  case AtomicRMWInst::Sub:
+    Op = Instruction::Sub;
+    break;
+  }
+
+  const unsigned PtrIdx = 0;
+  const unsigned ValIdx = 1;
+
+  // If the pointer operand is divergent, then each lane is doing an atomic
+  // operation on a different address, and we cannot optimize that.
+  if (DA->isDivergent(I.getOperand(PtrIdx))) {
+    return;
+  }
+
+  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+
+  // If the value operand is divergent, each lane is contributing a different
+  // value to the atomic calculation. We can only optimize divergent values if
+  // we have DPP available on our subtarget, and the atomic operation is 32
+  // bits.
+  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+    return;
+  }
+
+  // If we get here, we can optimize the atomic using a single wavefront-wide
+  // atomic operation to do the calculation for the entire wavefront, so
+  // remember the instruction so we can come back to it.
+  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};
+
+  ToReplace.push_back(Info);
+}
+
+void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
+  Instruction::BinaryOps Op;
+
+  switch (I.getIntrinsicID()) {
+  default:
+    return;
+  case Intrinsic::amdgcn_buffer_atomic_add:
+  case Intrinsic::amdgcn_struct_buffer_atomic_add:
+  case Intrinsic::amdgcn_raw_buffer_atomic_add:
+    Op = Instruction::Add;
+    break;
+  case Intrinsic::amdgcn_buffer_atomic_sub:
+  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+    Op = Instruction::Sub;
+    break;
+  }
+
+  const unsigned ValIdx = 0;
+
+  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+
+  // If the value operand is divergent, each lane is contributing a different
+  // value to the atomic calculation. We can only optimize divergent values if
+  // we have DPP available on our subtarget, and the atomic operation is 32
+  // bits.
+  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+    return;
+  }
+
+  // If any of the other arguments to the intrinsic are divergent, we can't
+  // optimize the operation.
+  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
+    if (DA->isDivergent(I.getOperand(Idx))) {
+      return;
+    }
+  }
+
+  // If we get here, we can optimize the atomic using a single wavefront-wide
+  // atomic operation to do the calculation for the entire wavefront, so
+  // remember the instruction so we can come back to it.
+  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};
+
+  ToReplace.push_back(Info);
+}
+
+void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
+                                           Instruction::BinaryOps Op,
+                                           unsigned ValIdx,
+                                           bool ValDivergent) const {
+  LLVMContext &Context = I.getContext();
+
+  // Start building just before the instruction.
+  IRBuilder<> B(&I);
+
+  // If we are in a pixel shader, because of how we have to mask out helper
+  // lane invocations, we need to record the entry and exit BB's.
+  BasicBlock *PixelEntryBB = nullptr;
+  BasicBlock *PixelExitBB = nullptr;
+
+  // If we're optimizing an atomic within a pixel shader, we need to wrap the
+  // entire atomic operation in a helper-lane check. We do not want any helper
+  // lanes that are around only for the purposes of derivatives to take part
+  // in any cross-lane communication, and we use a branch on whether the lane is
+  // live to do this.
+  if (IsPixelShader) {
+    // Record I's original position as the entry block.
+    PixelEntryBB = I.getParent();
+
+    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
+    Instruction *const NonHelperTerminator =
+        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
+
+    // Record I's new position as the exit block.
+    PixelExitBB = I.getParent();
+
+    I.moveBefore(NonHelperTerminator);
+    B.SetInsertPoint(&I);
+  }
+
+  Type *const Ty = I.getType();
+  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
+  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);
+
+  // This is the value in the atomic operation we need to combine in order to
+  // reduce the number of atomic operations.
+  Value *const V = I.getOperand(ValIdx);
+
+  // We need to know how many lanes are active within the wavefront, and we do
+  // this by getting the exec register, which tells us all the lanes that are
+  // active.
+  MDNode *const RegName =
+      llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec"));
+  Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName);
+  CallInst *const Exec =
+      B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata});
+  setConvergent(Exec);
+
+  // We need to know how many lanes are active within the wavefront that are
+  // below us. If we counted each lane linearly starting from 0, a lane is
+  // below us only if its associated index was less than ours. We do this by
+  // using the mbcnt intrinsic.
+  Value *const BitCast = B.CreateBitCast(Exec, VecTy);
+  Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
+  Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
+  CallInst *const PartialMbcnt = B.CreateIntrinsic(
+      Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
+  CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
+                                            {ExtractHi, PartialMbcnt});
+
+  Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
+
+  Value *LaneOffset = nullptr;
+  Value *NewV = nullptr;
+
+  // If we have a divergent value in each lane, we need to combine the value
+  // using DPP.
+  if (ValDivergent) {
+    // First we need to set all inactive invocations to 0, so that they can
+    // correctly contribute to the final result.
+    CallInst *const SetInactive = B.CreateIntrinsic(
+        Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)});
+    setConvergent(SetInactive);
+    NewV = SetInactive;
+
+    const unsigned Iters = 6;
+    const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1,     DPP_ROW_SR2,
+                                     DPP_ROW_SR4,     DPP_ROW_SR8,
+                                     DPP_ROW_BCAST15, DPP_ROW_BCAST31};
+    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
+
+    // This loop performs an inclusive scan across the wavefront, with all lanes
+    // active (by using the WWM intrinsic).
+    for (unsigned Idx = 0; Idx < Iters; Idx++) {
+      CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty,
+                                              {NewV, B.getInt32(DPPCtrl[Idx]),
+                                               B.getInt32(RowMask[Idx]),
+                                               B.getInt32(0xf), B.getFalse()});
+      setConvergent(DPP);
+      Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
+
+      NewV = B.CreateBinOp(Op, NewV, WWM);
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+    }
+
+    // NewV has returned the inclusive scan of V, but for the lane offset we
+    // require an exclusive scan. We do this by shifting the values from the
+    // entire wavefront right by 1, and by setting the bound_ctrl (last argument
+    // to the intrinsic below) to true, we can guarantee that 0 will be shifted
+    // into the 0'th invocation.
+    CallInst *const DPP =
+        B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
+                          {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
+                           B.getInt32(0xf), B.getTrue()});
+    setConvergent(DPP);
+    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
+
+    // Read the value from the last lane, which has accumlated the values of
+    // each active lane in the wavefront. This will be our new value with which
+    // we will provide to the atomic operation.
+    if (TyBitWidth == 64) {
+      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
+      Value *const ExtractHi =
+          B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
+      CallInst *const ReadLaneLo = B.CreateIntrinsic(
+          Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
+      setConvergent(ReadLaneLo);
+      CallInst *const ReadLaneHi = B.CreateIntrinsic(
+          Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
+      setConvergent(ReadLaneHi);
+      Value *const PartialInsert = B.CreateInsertElement(
+          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
+      Value *const Insert =
+          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
+      NewV = B.CreateBitCast(Insert, Ty);
+    } else if (TyBitWidth == 32) {
+      CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
+                                                   {}, {NewV, B.getInt32(63)});
+      setConvergent(ReadLane);
+      NewV = ReadLane;
+    } else {
+      llvm_unreachable("Unhandled atomic bit width");
+    }
+  } else {
+    // Get the total number of active lanes we have by using popcount.
+    Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec);
+    Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
+
+    // Calculate the new value we will be contributing to the atomic operation
+    // for the entire wavefront.
+    NewV = B.CreateMul(V, CtpopCast);
+    LaneOffset = B.CreateMul(V, MbcntCast);
+  }
+
+  // We only want a single lane to enter our new control flow, and we do this
+  // by checking if there are any active lanes below us. Only one lane will
+  // have 0 active lanes below us, so that will be the only one to progress.
+  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));
+
+  // Store I's original basic block before we split the block.
+  BasicBlock *const EntryBB = I.getParent();
+
+  // We need to introduce some new control flow to force a single lane to be
+  // active. We do this by splitting I's basic block at I, and introducing the
+  // new block such that:
+  // entry --> single_lane -\
+  //       \------------------> exit
+  Instruction *const SingleLaneTerminator =
+      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
+
+  // Move the IR builder into single_lane next.
+  B.SetInsertPoint(SingleLaneTerminator);
+
+  // Clone the original atomic operation into single lane, replacing the
+  // original value with our newly created one.
+  Instruction *const NewI = I.clone();
+  B.Insert(NewI);
+  NewI->setOperand(ValIdx, NewV);
+
+  // Move the IR builder into exit next, and start inserting just before the
+  // original instruction.
+  B.SetInsertPoint(&I);
+
+  // Create a PHI node to get our new atomic result into the exit block.
+  PHINode *const PHI = B.CreatePHI(Ty, 2);
+  PHI->addIncoming(UndefValue::get(Ty), EntryBB);
+  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
+
+  // We need to broadcast the value who was the lowest active lane (the first
+  // lane) to all other lanes in the wavefront. We use an intrinsic for this,
+  // but have to handle 64-bit broadcasts with two calls to this intrinsic.
+  Value *BroadcastI = nullptr;
+
+  if (TyBitWidth == 64) {
+    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
+    Value *const ExtractHi =
+        B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
+    CallInst *const ReadFirstLaneLo =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
+    setConvergent(ReadFirstLaneLo);
+    CallInst *const ReadFirstLaneHi =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+    setConvergent(ReadFirstLaneHi);
+    Value *const PartialInsert = B.CreateInsertElement(
+        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
+    Value *const Insert =
+        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
+    BroadcastI = B.CreateBitCast(Insert, Ty);
+  } else if (TyBitWidth == 32) {
+    CallInst *const ReadFirstLane =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
+    setConvergent(ReadFirstLane);
+    BroadcastI = ReadFirstLane;
+  } else {
+    llvm_unreachable("Unhandled atomic bit width");
+  }
+
+  // Now that we have the result of our single atomic operation, we need to
+  // get our individual lane's slice into the result. We use the lane offset we
+  // previously calculated combined with the atomic result value we got from the
+  // first lane, to get our lane's index into the atomic result.
+  Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
+
+  if (IsPixelShader) {
+    // Need a final PHI to reconverge to above the helper lane branch mask.
+    B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+
+    PHINode *const PHI = B.CreatePHI(Ty, 2);
+    PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
+    PHI->addIncoming(Result, I.getParent());
+    I.replaceAllUsesWith(PHI);
+  } else {
+    // Replace the original atomic instruction with the new one.
+    I.replaceAllUsesWith(Result);
+  }
+
+  // And delete the original.
+  I.eraseFromParent();
+}
+
+void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
+  CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
+                      "AMDGPU atomic optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
+                    "AMDGPU atomic optimizations", false, false)
+
+FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
+  return new AMDGPUAtomicOptimizer();
+}
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 18c7df0d94f21..daef37f9c21fb 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -28,11 +28,12 @@
 using namespace llvm;
 
 AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
-  : CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) {
+  : CallLowering(&TLI) {
 }
 
 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
-                                     const Value *Val, unsigned VReg) const {
+                                     const Value *Val,
+                                     ArrayRef<unsigned> VRegs) const {
   // FIXME: Add support for non-void returns.
   if (Val)
     return false;
@@ -50,7 +51,7 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
   const DataLayout &DL = F.getParent()->getDataLayout();
-  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
+  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
   LLT PtrType = getLLTForType(*PtrTy, DL);
   unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
   unsigned KernArgSegmentPtr =
@@ -72,7 +73,7 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
   MachineFunction &MF = MIRBuilder.getMF();
   const Function &F = MF.getFunction();
   const DataLayout &DL = F.getParent()->getDataLayout();
-  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
+  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
   unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
   unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index f51cb6abbf65c..ed859716218ef 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -23,8 +23,6 @@ namespace llvm {
 class AMDGPUTargetLowering;
 
 class AMDGPUCallLowering: public CallLowering {
-  AMDGPUAS AMDGPUASI;
-
   unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
                              uint64_t Offset) const;
 
@@ -35,8 +33,8 @@ class AMDGPUCallLowering: public CallLowering {
  public:
   AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
 
-  bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
-                   unsigned VReg) const override;
+  bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+                   ArrayRef<unsigned> VRegs) const override;
   bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                             ArrayRef<unsigned> VRegs) const override;
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 68bc7fdd99618..367f120b5fa6b 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -19,7 +19,7 @@ class CCIfExtend<CCAction A>
 // Calling convention for SI
 def CC_SI : CallingConv<[
 
-  CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
+  CCIfInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
     SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -33,7 +33,7 @@ def CC_SI : CallingConv<[
   CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
 
   // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
-  CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
+  CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -64,7 +64,7 @@ def RetCC_SI_Shader : CallingConv<[
   ]>>,
 
   // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
-  CCIfType<[f32, f16] , CCAssignToReg<[
+  CCIfType<[f32, f16, v2f16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 5713b7b7f9a84..4dc1e67c573d3 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -18,7 +18,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Passes.h"
@@ -60,10 +60,9 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
                              public InstVisitor<AMDGPUCodeGenPrepare, bool> {
   const GCNSubtarget *ST = nullptr;
   AssumptionCache *AC = nullptr;
-  DivergenceAnalysis *DA = nullptr;
+  LegacyDivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
   bool HasUnsafeFPMath = false;
-  AMDGPUAS AMDGPUASI;
 
   /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
   /// binary operation \p V.
@@ -177,7 +176,7 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
     AU.setPreservesAll();
  }
 };
@@ -559,7 +558,7 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
   Value *FQM = Builder.CreateFMul(FA, RCP);
 
   // fq = trunc(fqm);
-  CallInst* FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM });
+  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
   FQ->copyFastMathFlags(Builder.getFastMathFlags());
 
   // float fqneg = -fq;
@@ -567,17 +566,17 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
 
   // float fr = mad(fqneg, fb, fa);
   Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
-                                      { FQNeg, FB, FA }, FQ);
+                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
 
   // int iq = (int)fq;
   Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                        : Builder.CreateFPToUI(FQ, I32Ty);
 
   // fr = fabs(fr);
-  FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ);
+  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
 
   // fb = fabs(fb);
-  FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ);
+  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
 
   // int cv = fr >= fb;
   Value *CV = Builder.CreateFCmpOGE(FR, FB);
@@ -799,8 +798,8 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
   if (!WidenLoads)
     return false;
 
-  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
-       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
+  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
       canWidenScalarExtLoad(I)) {
     IRBuilder<> Builder(&I);
     Builder.SetCurrentDebugLocation(I.getDebugLoc());
@@ -898,9 +897,8 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
   ST = &TM.getSubtarget<GCNSubtarget>(F);
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
-  DA = &getAnalysis<DivergenceAnalysis>();
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();
   HasUnsafeFPMath = hasUnsafeFPMath(F);
-  AMDGPUASI = TM.getAMDGPUAS();
 
   bool MadeChange = false;
 
@@ -918,7 +916,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                     false, false)
 
diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td
index b375cae9018ea..3c7d8a8fc5509 100644
--- a/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -19,6 +19,15 @@ def FeatureFMA : SubtargetFeature<"fmaf",
   "Enable single precision FMA (not as fast as mul+add, but fused)"
 >;
 
+// Some instructions do not support denormals despite this flag. Using
+// fp32 denormals also causes instructions to run at the double
+// precision rate for the device.
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
+  "FP32Denormals",
+  "true",
+  "Enable single precision denormal handling"
+>;
+
 class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
   "localmemorysize"#Value,
   "LocalMemorySize",
diff --git a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
new file mode 100644
index 0000000000000..6e2a981d33968
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -0,0 +1,63 @@
+//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Promote indirect (bitcast) calls to direct calls when they are statically
+/// known to be direct. Required when InstCombine is not run (e.g. at OptNone)
+/// because AMDGPU does not support indirect calls.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-fix-function-bitcasts"
+
+namespace {
+class AMDGPUFixFunctionBitcasts final
+    : public ModulePass,
+      public InstVisitor<AMDGPUFixFunctionBitcasts> {
+
+  bool runOnModule(Module &M) override;
+
+  bool Modified;
+
+public:
+  void visitCallSite(CallSite CS) {
+    if (CS.getCalledFunction())
+      return;
+    auto Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+    if (Callee && isLegalToPromote(CS, Callee)) {
+      promoteCall(CS, Callee);
+      Modified = true;
+    }
+  }
+
+  static char ID;
+  AMDGPUFixFunctionBitcasts() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char AMDGPUFixFunctionBitcasts::ID = 0;
+char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID;
+INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE,
+                "Fix function bitcasts for AMDGPU", false, false)
+
+ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() {
+  return new AMDGPUFixFunctionBitcasts();
+}
+
+bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) {
+  Modified = false;
+  visit(M);
+  return Modified;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td
index ba735390f6791..59bb2a16e0f34 100644
--- a/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -122,15 +122,14 @@ def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>;
 }
 def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>;
 
-// FIXME: Select directly to _e32 so we don't need to deal with modifiers.
 // FIXME: We can't re-use SelectionDAG patterns here because they match
 // against a custom SDNode and we would need to create a generic machine
 // instruction that is equivalent to the custom SDNode.  This would also require
 // us to custom legalize the intrinsic to the new generic machine instruction,
 // but I can't get custom legalizing of intrinsic to work and I'm not sure if
 // this is even supported yet.
-defm : GISelVop2IntrPat <
-  int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e32, v2f16, f32>;
+def : GISelVop3Pat2ModsPat <
+  int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e64, v2f16, f32>;
 
 defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>;
 def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>;
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 3a58c6c6a29fe..6eab59ab4e09b 100644
--- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -16,34 +16,38 @@ namespace AMDGPU {
 
 enum PartialMappingIdx {
   None = - 1,
-  PM_SGPR1  = 0,
-  PM_SGPR16 = 4,
-  PM_SGPR32 = 5,
-  PM_SGPR64 = 6,
-  PM_SGPR128 = 7,
-  PM_SGPR256 = 8,
-  PM_SGPR512 = 9,
-  PM_VGPR1  = 10,
-  PM_VGPR16 = 14,
-  PM_VGPR32 = 15,
-  PM_VGPR64 = 16,
-  PM_VGPR128 = 17,
-  PM_VGPR256 = 18,
-  PM_VGPR512 = 19,
-  PM_SGPR96 = 20,
-  PM_VGPR96 = 21
+  PM_SGPR1  = 2,
+  PM_SGPR16 = 6,
+  PM_SGPR32 = 7,
+  PM_SGPR64 = 8,
+  PM_SGPR128 = 9,
+  PM_SGPR256 = 10,
+  PM_SGPR512 = 11,
+  PM_VGPR1  = 12,
+  PM_VGPR16 = 16,
+  PM_VGPR32 = 17,
+  PM_VGPR64 = 18,
+  PM_VGPR128 = 19,
+  PM_VGPR256 = 20,
+  PM_VGPR512 = 21,
+  PM_SGPR96 = 22,
+  PM_VGPR96 = 23
 };
 
 const RegisterBankInfo::PartialMapping PartMappings[] {
   // StartIdx, Length, RegBank
   {0, 1,  SCCRegBank},
+  {0, 1,  VCCRegBank},
+
+  {0, 1,  SGPRRegBank}, // SGPR begin
   {0, 16, SGPRRegBank},
   {0, 32, SGPRRegBank},
   {0, 64, SGPRRegBank},
   {0, 128, SGPRRegBank},
   {0, 256, SGPRRegBank},
   {0, 512, SGPRRegBank},
-  {0, 1,  SGPRRegBank},
+
+  {0, 1,  VGPRRegBank}, // VGPR begin
   {0, 16, VGPRRegBank},
   {0, 32, VGPRRegBank},
   {0, 64, VGPRRegBank},
@@ -55,33 +59,43 @@ const RegisterBankInfo::PartialMapping PartMappings[] {
 };
 
 const RegisterBankInfo::ValueMapping ValMappings[] {
+  // SCC
   {&PartMappings[0], 1},
-  {nullptr, 0},
-  {nullptr, 0},
-  {nullptr, 0},
+
+  // VCC
   {&PartMappings[1], 1},
+
+  // SGPRs
   {&PartMappings[2], 1},
+  {nullptr, 0}, // Illegal power of 2 sizes
+  {nullptr, 0},
+  {nullptr, 0},
   {&PartMappings[3], 1},
   {&PartMappings[4], 1},
   {&PartMappings[5], 1},
   {&PartMappings[6], 1},
   {&PartMappings[7], 1},
+  {&PartMappings[8], 1},
+
+    // VGPRs
+  {&PartMappings[9], 1},
   {nullptr, 0},
   {nullptr, 0},
   {nullptr, 0},
-  {&PartMappings[8], 1},
-  {&PartMappings[9], 1},
   {&PartMappings[10], 1},
   {&PartMappings[11], 1},
   {&PartMappings[12], 1},
   {&PartMappings[13], 1},
   {&PartMappings[14], 1},
-  {&PartMappings[15], 1}
+  {&PartMappings[15], 1},
+  {&PartMappings[16], 1},
+  {&PartMappings[17], 1}
 };
 
 enum ValueMappingIdx {
-  SGPRStartIdx = 0,
-  VGPRStartIdx = 10
+  SCCStartIdx = 0,
+  SGPRStartIdx = 2,
+  VGPRStartIdx = 12
 };
 
 const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
@@ -89,16 +103,28 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
   unsigned Idx;
   switch (Size) {
   case 1:
-    Idx = BankID == AMDGPU::SCCRegBankID ? PM_SGPR1 : PM_VGPR1;
+    if (BankID == AMDGPU::SCCRegBankID)
+      return &ValMappings[0];
+    if (BankID == AMDGPU::VCCRegBankID)
+      return &ValMappings[1];
+
+    // 1-bit values not from a compare etc.
+    Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR1 : PM_VGPR1;
     break;
   case 96:
+    assert(BankID != AMDGPU::VCCRegBankID);
     Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR96 : PM_VGPR96;
     break;
   default:
+    assert(BankID != AMDGPU::VCCRegBankID);
     Idx = BankID == AMDGPU::VGPRRegBankID ? VGPRStartIdx : SGPRStartIdx;
     Idx += Log2_32_Ceil(Size);
     break;
   }
+
+  assert(Log2_32_Ceil(Size) == Log2_32_Ceil(ValMappings[Idx].BreakDown->Length));
+  assert(BankID == ValMappings[Idx].BreakDown->RegBank->getID());
+
   return &ValMappings[Idx];
 }
 
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 01ef346f74ee8..c38b0e61558b3 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -16,6 +16,7 @@
 #include "AMDGPUHSAMetadataStreamer.h"
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIProgramInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
@@ -36,11 +37,14 @@ static cl::opt<bool> VerifyHSAMetadata(
 namespace AMDGPU {
 namespace HSAMD {
 
-void MetadataStreamer::dump(StringRef HSAMetadataString) const {
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV2
+//===----------------------------------------------------------------------===//
+void MetadataStreamerV2::dump(StringRef HSAMetadataString) const {
   errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
 }
 
-void MetadataStreamer::verify(StringRef HSAMetadataString) const {
+void MetadataStreamerV2::verify(StringRef HSAMetadataString) const {
   errs() << "AMDGPU HSA Metadata Parser Test: ";
 
   HSAMD::Metadata FromHSAMetadataString;
@@ -63,7 +67,8 @@ void MetadataStreamer::verify(StringRef HSAMetadataString) const {
   }
 }
 
-AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const {
+AccessQualifier
+MetadataStreamerV2::getAccessQualifier(StringRef AccQual) const {
   if (AccQual.empty())
     return AccessQualifier::Unknown;
 
@@ -74,26 +79,29 @@ AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const {
              .Default(AccessQualifier::Default);
 }
 
-AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer(
+AddressSpaceQualifier
+MetadataStreamerV2::getAddressSpaceQualifier(
     unsigned AddressSpace) const {
-  if (AddressSpace == AMDGPUASI.PRIVATE_ADDRESS)
+  switch (AddressSpace) {
+  case AMDGPUAS::PRIVATE_ADDRESS:
     return AddressSpaceQualifier::Private;
-  if (AddressSpace == AMDGPUASI.GLOBAL_ADDRESS)
+  case AMDGPUAS::GLOBAL_ADDRESS:
     return AddressSpaceQualifier::Global;
-  if (AddressSpace == AMDGPUASI.CONSTANT_ADDRESS)
+  case AMDGPUAS::CONSTANT_ADDRESS:
     return AddressSpaceQualifier::Constant;
-  if (AddressSpace == AMDGPUASI.LOCAL_ADDRESS)
+  case AMDGPUAS::LOCAL_ADDRESS:
     return AddressSpaceQualifier::Local;
-  if (AddressSpace == AMDGPUASI.FLAT_ADDRESS)
+  case AMDGPUAS::FLAT_ADDRESS:
     return AddressSpaceQualifier::Generic;
-  if (AddressSpace == AMDGPUASI.REGION_ADDRESS)
+  case AMDGPUAS::REGION_ADDRESS:
     return AddressSpaceQualifier::Region;
-
-  llvm_unreachable("Unknown address space qualifier");
+  default:
+    return AddressSpaceQualifier::Unknown;
+  }
 }
 
-ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
-                                         StringRef BaseTypeName) const {
+ValueKind MetadataStreamerV2::getValueKind(Type *Ty, StringRef TypeQual,
+                                           StringRef BaseTypeName) const {
   if (TypeQual.find("pipe") != StringRef::npos)
     return ValueKind::Pipe;
 
@@ -114,13 +122,13 @@ ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
              .Case("queue_t", ValueKind::Queue)
              .Default(isa<PointerType>(Ty) ?
                           (Ty->getPointerAddressSpace() ==
-                           AMDGPUASI.LOCAL_ADDRESS ?
+                           AMDGPUAS::LOCAL_ADDRESS ?
                            ValueKind::DynamicSharedPointer :
                            ValueKind::GlobalBuffer) :
                       ValueKind::ByValue);
 }
 
-ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const {
+ValueType MetadataStreamerV2::getValueType(Type *Ty, StringRef TypeName) const {
   switch (Ty->getTypeID()) {
   case Type::IntegerTyID: {
     auto Signed = !TypeName.startswith("u");
@@ -152,7 +160,7 @@ ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const {
   }
 }
 
-std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const {
+std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const {
   switch (Ty->getTypeID()) {
   case Type::IntegerTyID: {
     if (!Signed)
@@ -189,8 +197,8 @@ std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const {
   }
 }
 
-std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
-    MDNode *Node) const {
+std::vector<uint32_t>
+MetadataStreamerV2::getWorkGroupDimensions(MDNode *Node) const {
   std::vector<uint32_t> Dims;
   if (Node->getNumOperands() != 3)
     return Dims;
@@ -200,9 +208,9 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
   return Dims;
 }
 
-Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
-    const MachineFunction &MF,
-    const SIProgramInfo &ProgramInfo) const {
+Kernel::CodeProps::Metadata
+MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF,
+                                    const SIProgramInfo &ProgramInfo) const {
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
   HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
@@ -229,9 +237,9 @@ Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
   return HSACodeProps;
 }
 
-Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps(
-    const MachineFunction &MF,
-    const SIProgramInfo &ProgramInfo) const {
+Kernel::DebugProps::Metadata
+MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF,
+                                     const SIProgramInfo &ProgramInfo) const {
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
 
@@ -251,14 +259,14 @@ Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps(
   return HSADebugProps;
 }
 
-void MetadataStreamer::emitVersion() {
+void MetadataStreamerV2::emitVersion() {
   auto &Version = HSAMetadata.mVersion;
 
   Version.push_back(VersionMajor);
   Version.push_back(VersionMinor);
 }
 
-void MetadataStreamer::emitPrintf(const Module &Mod) {
+void MetadataStreamerV2::emitPrintf(const Module &Mod) {
   auto &Printf = HSAMetadata.mPrintf;
 
   auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
@@ -270,7 +278,7 @@ void MetadataStreamer::emitPrintf(const Module &Mod) {
       Printf.push_back(cast<MDString>(Op->getOperand(0))->getString());
 }
 
-void MetadataStreamer::emitKernelLanguage(const Function &Func) {
+void MetadataStreamerV2::emitKernelLanguage(const Function &Func) {
   auto &Kernel = HSAMetadata.mKernels.back();
 
   // TODO: What about other languages?
@@ -288,7 +296,7 @@ void MetadataStreamer::emitKernelLanguage(const Function &Func) {
       mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue());
 }
 
-void MetadataStreamer::emitKernelAttrs(const Function &Func) {
+void MetadataStreamerV2::emitKernelAttrs(const Function &Func) {
   auto &Attrs = HSAMetadata.mKernels.back().mAttrs;
 
   if (auto Node = Func.getMetadata("reqd_work_group_size"))
@@ -306,14 +314,14 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) {
   }
 }
 
-void MetadataStreamer::emitKernelArgs(const Function &Func) {
+void MetadataStreamerV2::emitKernelArgs(const Function &Func) {
   for (auto &Arg : Func.args())
     emitKernelArg(Arg);
 
   emitHiddenKernelArgs(Func);
 }
 
-void MetadataStreamer::emitKernelArg(const Argument &Arg) {
+void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
   auto Func = Arg.getParent();
   auto ArgNo = Arg.getArgNo();
   const MDNode *Node;
@@ -355,7 +363,7 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) {
 
   unsigned PointeeAlign = 0;
   if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
-    if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+    if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
       PointeeAlign = Arg.getParamAlignment();
       if (PointeeAlign == 0)
         PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
@@ -366,12 +374,12 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) {
                 PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
 }
 
-void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
-                                     ValueKind ValueKind,
-                                     unsigned PointeeAlign,
-                                     StringRef Name,
-                                     StringRef TypeName, StringRef BaseTypeName,
-                                     StringRef AccQual, StringRef TypeQual) {
+void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
+                                       ValueKind ValueKind,
+                                       unsigned PointeeAlign, StringRef Name,
+                                       StringRef TypeName,
+                                       StringRef BaseTypeName,
+                                       StringRef AccQual, StringRef TypeQual) {
   HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
   auto &Arg = HSAMetadata.mKernels.back().mArgs.back();
 
@@ -384,7 +392,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
   Arg.mPointeeAlign = PointeeAlign;
 
   if (auto PtrTy = dyn_cast<PointerType>(Ty))
-    Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace());
+    Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace());
 
   Arg.mAccQual = getAccessQualifier(AccQual);
 
@@ -404,7 +412,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
   }
 }
 
-void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
+void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
   int HiddenArgNumBytes =
       getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
 
@@ -422,7 +430,7 @@ void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
     emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
 
   auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
-                                      AMDGPUASI.GLOBAL_ADDRESS);
+                                      AMDGPUAS::GLOBAL_ADDRESS);
 
   // Emit "printf buffer" argument if printf is used, otherwise emit dummy
   // "none" argument.
@@ -446,13 +454,16 @@ void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
   }
 }
 
-void MetadataStreamer::begin(const Module &Mod) {
-  AMDGPUASI = getAMDGPUAS(Mod);
+bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+  return TargetStreamer.EmitHSAMetadata(getHSAMetadata());
+}
+
+void MetadataStreamerV2::begin(const Module &Mod) {
   emitVersion();
   emitPrintf(Mod);
 }
 
-void MetadataStreamer::end() {
+void MetadataStreamerV2::end() {
   std::string HSAMetadataString;
   if (toString(HSAMetadata, HSAMetadataString))
     return;
@@ -463,7 +474,8 @@ void MetadataStreamer::end() {
     verify(HSAMetadataString);
 }
 
-void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) {
+void MetadataStreamerV2::emitKernel(const MachineFunction &MF,
+                                    const SIProgramInfo &ProgramInfo) {
   auto &Func = MF.getFunction();
   if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
     return;
@@ -483,6 +495,505 @@ void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo
   HSAMetadata.mKernels.back().mDebugProps = DebugProps;
 }
 
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV3
+//===----------------------------------------------------------------------===//
+
+void MetadataStreamerV3::dump(StringRef HSAMetadataString) const {
+  errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
+}
+
+void MetadataStreamerV3::verify(StringRef HSAMetadataString) const {
+  errs() << "AMDGPU HSA Metadata Parser Test: ";
+
+  std::shared_ptr<msgpack::Node> FromHSAMetadataString =
+      std::make_shared<msgpack::MapNode>();
+
+  yaml::Input YIn(HSAMetadataString);
+  YIn >> FromHSAMetadataString;
+  if (YIn.error()) {
+    errs() << "FAIL\n";
+    return;
+  }
+
+  std::string ToHSAMetadataString;
+  raw_string_ostream StrOS(ToHSAMetadataString);
+  yaml::Output YOut(StrOS);
+  YOut << FromHSAMetadataString;
+
+  errs() << (HSAMetadataString == StrOS.str() ? "PASS" : "FAIL") << '\n';
+  if (HSAMetadataString != ToHSAMetadataString) {
+    errs() << "Original input: " << HSAMetadataString << '\n'
+           << "Produced output: " << StrOS.str() << '\n';
+  }
+}
+
+Optional<StringRef>
+MetadataStreamerV3::getAccessQualifier(StringRef AccQual) const {
+  return StringSwitch<Optional<StringRef>>(AccQual)
+      .Case("read_only", StringRef("read_only"))
+      .Case("write_only", StringRef("write_only"))
+      .Case("read_write", StringRef("read_write"))
+      .Default(None);
+}
+
+Optional<StringRef>
+MetadataStreamerV3::getAddressSpaceQualifier(unsigned AddressSpace) const {
+  switch (AddressSpace) {
+  case AMDGPUAS::PRIVATE_ADDRESS:
+    return StringRef("private");
+  case AMDGPUAS::GLOBAL_ADDRESS:
+    return StringRef("global");
+  case AMDGPUAS::CONSTANT_ADDRESS:
+    return StringRef("constant");
+  case AMDGPUAS::LOCAL_ADDRESS:
+    return StringRef("local");
+  case AMDGPUAS::FLAT_ADDRESS:
+    return StringRef("generic");
+  case AMDGPUAS::REGION_ADDRESS:
+    return StringRef("region");
+  default:
+    return None;
+  }
+}
+
+StringRef MetadataStreamerV3::getValueKind(Type *Ty, StringRef TypeQual,
+                                           StringRef BaseTypeName) const {
+  if (TypeQual.find("pipe") != StringRef::npos)
+    return "pipe";
+
+  return StringSwitch<StringRef>(BaseTypeName)
+      .Case("image1d_t", "image")
+      .Case("image1d_array_t", "image")
+      .Case("image1d_buffer_t", "image")
+      .Case("image2d_t", "image")
+      .Case("image2d_array_t", "image")
+      .Case("image2d_array_depth_t", "image")
+      .Case("image2d_array_msaa_t", "image")
+      .Case("image2d_array_msaa_depth_t", "image")
+      .Case("image2d_depth_t", "image")
+      .Case("image2d_msaa_t", "image")
+      .Case("image2d_msaa_depth_t", "image")
+      .Case("image3d_t", "image")
+      .Case("sampler_t", "sampler")
+      .Case("queue_t", "queue")
+      .Default(isa<PointerType>(Ty)
+                   ? (Ty->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
+                          ? "dynamic_shared_pointer"
+                          : "global_buffer")
+                   : "by_value");
+}
+
+StringRef MetadataStreamerV3::getValueType(Type *Ty, StringRef TypeName) const {
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    auto Signed = !TypeName.startswith("u");
+    switch (Ty->getIntegerBitWidth()) {
+    case 8:
+      return Signed ? "i8" : "u8";
+    case 16:
+      return Signed ? "i16" : "u16";
+    case 32:
+      return Signed ? "i32" : "u32";
+    case 64:
+      return Signed ? "i64" : "u64";
+    default:
+      return "struct";
+    }
+  }
+  case Type::HalfTyID:
+    return "f16";
+  case Type::FloatTyID:
+    return "f32";
+  case Type::DoubleTyID:
+    return "f64";
+  case Type::PointerTyID:
+    return getValueType(Ty->getPointerElementType(), TypeName);
+  case Type::VectorTyID:
+    return getValueType(Ty->getVectorElementType(), TypeName);
+  default:
+    return "struct";
+  }
+}
+
+std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const {
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    if (!Signed)
+      return (Twine('u') + getTypeName(Ty, true)).str();
+
+    auto BitWidth = Ty->getIntegerBitWidth();
+    switch (BitWidth) {
+    case 8:
+      return "char";
+    case 16:
+      return "short";
+    case 32:
+      return "int";
+    case 64:
+      return "long";
+    default:
+      return (Twine('i') + Twine(BitWidth)).str();
+    }
+  }
+  case Type::HalfTyID:
+    return "half";
+  case Type::FloatTyID:
+    return "float";
+  case Type::DoubleTyID:
+    return "double";
+  case Type::VectorTyID: {
+    auto VecTy = cast<VectorType>(Ty);
+    auto ElTy = VecTy->getElementType();
+    auto NumElements = VecTy->getVectorNumElements();
+    return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
+  }
+  default:
+    return "unknown";
+  }
+}
+
+std::shared_ptr<msgpack::ArrayNode>
+MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const {
+  auto Dims = std::make_shared<msgpack::ArrayNode>();
+  if (Node->getNumOperands() != 3)
+    return Dims;
+
+  for (auto &Op : Node->operands())
+    Dims->push_back(std::make_shared<msgpack::ScalarNode>(
+        mdconst::extract<ConstantInt>(Op)->getZExtValue()));
+  return Dims;
+}
+
+void MetadataStreamerV3::emitVersion() {
+  auto Version = std::make_shared<msgpack::ArrayNode>();
+  Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMajor));
+  Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMinor));
+  getRootMetadata("amdhsa.version") = std::move(Version);
+}
+
+void MetadataStreamerV3::emitPrintf(const Module &Mod) {
+  auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
+  if (!Node)
+    return;
+
+  auto Printf = std::make_shared<msgpack::ArrayNode>();
+  for (auto Op : Node->operands())
+    if (Op->getNumOperands())
+      Printf->push_back(std::make_shared<msgpack::ScalarNode>(
+          cast<MDString>(Op->getOperand(0))->getString()));
+  getRootMetadata("amdhsa.printf") = std::move(Printf);
+}
+
+void MetadataStreamerV3::emitKernelLanguage(const Function &Func,
+                                            msgpack::MapNode &Kern) {
+  // TODO: What about other languages?
+  auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
+  if (!Node || !Node->getNumOperands())
+    return;
+  auto Op0 = Node->getOperand(0);
+  if (Op0->getNumOperands() <= 1)
+    return;
+
+  Kern[".language"] = std::make_shared<msgpack::ScalarNode>("OpenCL C");
+  auto LanguageVersion = std::make_shared<msgpack::ArrayNode>();
+  LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>(
+      mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()));
+  LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>(
+      mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()));
+  Kern[".language_version"] = std::move(LanguageVersion);
+}
+
+void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
+                                         msgpack::MapNode &Kern) {
+
+  if (auto Node = Func.getMetadata("reqd_work_group_size"))
+    Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node);
+  if (auto Node = Func.getMetadata("work_group_size_hint"))
+    Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node);
+  if (auto Node = Func.getMetadata("vec_type_hint")) {
+    Kern[".vec_type_hint"] = std::make_shared<msgpack::ScalarNode>(getTypeName(
+        cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
+        mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()));
+  }
+  if (Func.hasFnAttribute("runtime-handle")) {
+    Kern[".device_enqueue_symbol"] = std::make_shared<msgpack::ScalarNode>(
+        Func.getFnAttribute("runtime-handle").getValueAsString().str());
+  }
+}
+
+void MetadataStreamerV3::emitKernelArgs(const Function &Func,
+                                        msgpack::MapNode &Kern) {
+  unsigned Offset = 0;
+  auto Args = std::make_shared<msgpack::ArrayNode>();
+  for (auto &Arg : Func.args())
+    emitKernelArg(Arg, Offset, *Args);
+
+  emitHiddenKernelArgs(Func, Offset, *Args);
+
+  // TODO: What about other languages?
+  if (Func.getParent()->getNamedMetadata("opencl.ocl.version")) {
+    auto &DL = Func.getParent()->getDataLayout();
+    auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, *Args);
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, *Args);
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, *Args);
+
+    auto Int8PtrTy =
+        Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+
+    // Emit "printf buffer" argument if printf is used, otherwise emit dummy
+    // "none" argument.
+    if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+      emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, *Args);
+    else
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
+
+    // Emit "default queue" and "completion action" arguments if enqueue kernel
+    // is used, otherwise emit dummy "none" arguments.
+    if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+      emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, *Args);
+      emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, *Args);
+    } else {
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
+    }
+  }
+
+  Kern[".args"] = std::move(Args);
+}
+
+void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
+                                       msgpack::ArrayNode &Args) {
+  auto Func = Arg.getParent();
+  auto ArgNo = Arg.getArgNo();
+  const MDNode *Node;
+
+  StringRef Name;
+  Node = Func->getMetadata("kernel_arg_name");
+  if (Node && ArgNo < Node->getNumOperands())
+    Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
+  else if (Arg.hasName())
+    Name = Arg.getName();
+
+  StringRef TypeName;
+  Node = Func->getMetadata("kernel_arg_type");
+  if (Node && ArgNo < Node->getNumOperands())
+    TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  StringRef BaseTypeName;
+  Node = Func->getMetadata("kernel_arg_base_type");
+  if (Node && ArgNo < Node->getNumOperands())
+    BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  StringRef AccQual;
+  if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() &&
+      Arg.hasNoAliasAttr()) {
+    AccQual = "read_only";
+  } else {
+    Node = Func->getMetadata("kernel_arg_access_qual");
+    if (Node && ArgNo < Node->getNumOperands())
+      AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+  }
+
+  StringRef TypeQual;
+  Node = Func->getMetadata("kernel_arg_type_qual");
+  if (Node && ArgNo < Node->getNumOperands())
+    TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  Type *Ty = Arg.getType();
+  const DataLayout &DL = Func->getParent()->getDataLayout();
+
+  unsigned PointeeAlign = 0;
+  if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+    if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      PointeeAlign = Arg.getParamAlignment();
+      if (PointeeAlign == 0)
+        PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
+    }
+  }
+
+  emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
+                getValueKind(Arg.getType(), TypeQual, BaseTypeName), Offset,
+                Args, PointeeAlign, Name, TypeName, BaseTypeName, AccQual,
+                TypeQual);
+}
+
+void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
+                                       StringRef ValueKind, unsigned &Offset,
+                                       msgpack::ArrayNode &Args,
+                                       unsigned PointeeAlign, StringRef Name,
+                                       StringRef TypeName,
+                                       StringRef BaseTypeName,
+                                       StringRef AccQual, StringRef TypeQual) {
+  auto ArgPtr = std::make_shared<msgpack::MapNode>();
+  auto &Arg = *ArgPtr;
+
+  if (!Name.empty())
+    Arg[".name"] = std::make_shared<msgpack::ScalarNode>(Name);
+  if (!TypeName.empty())
+    Arg[".type_name"] = std::make_shared<msgpack::ScalarNode>(TypeName);
+  auto Size = DL.getTypeAllocSize(Ty);
+  auto Align = DL.getABITypeAlignment(Ty);
+  Arg[".size"] = std::make_shared<msgpack::ScalarNode>(Size);
+  Offset = alignTo(Offset, Align);
+  Arg[".offset"] = std::make_shared<msgpack::ScalarNode>(Offset);
+  Offset += Size;
+  Arg[".value_kind"] = std::make_shared<msgpack::ScalarNode>(ValueKind);
+  Arg[".value_type"] =
+      std::make_shared<msgpack::ScalarNode>(getValueType(Ty, BaseTypeName));
+  if (PointeeAlign)
+    Arg[".pointee_align"] = std::make_shared<msgpack::ScalarNode>(PointeeAlign);
+
+  if (auto PtrTy = dyn_cast<PointerType>(Ty))
+    if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace()))
+      Arg[".address_space"] = std::make_shared<msgpack::ScalarNode>(*Qualifier);
+
+  if (auto AQ = getAccessQualifier(AccQual))
+    Arg[".access"] = std::make_shared<msgpack::ScalarNode>(*AQ);
+
+  // TODO: Emit Arg[".actual_access"].
+
+  SmallVector<StringRef, 1> SplitTypeQuals;
+  TypeQual.split(SplitTypeQuals, " ", -1, false);
+  for (StringRef Key : SplitTypeQuals) {
+    if (Key == "const")
+      Arg[".is_const"] = std::make_shared<msgpack::ScalarNode>(true);
+    else if (Key == "restrict")
+      Arg[".is_restrict"] = std::make_shared<msgpack::ScalarNode>(true);
+    else if (Key == "volatile")
+      Arg[".is_volatile"] = std::make_shared<msgpack::ScalarNode>(true);
+    else if (Key == "pipe")
+      Arg[".is_pipe"] = std::make_shared<msgpack::ScalarNode>(true);
+  }
+
+  Args.push_back(std::move(ArgPtr));
+}
+
+void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
+                                              unsigned &Offset,
+                                              msgpack::ArrayNode &Args) {
+  int HiddenArgNumBytes =
+      getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
+
+  if (!HiddenArgNumBytes)
+    return;
+
+  auto &DL = Func.getParent()->getDataLayout();
+  auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+  if (HiddenArgNumBytes >= 8)
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, Args);
+  if (HiddenArgNumBytes >= 16)
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, Args);
+  if (HiddenArgNumBytes >= 24)
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, Args);
+
+  auto Int8PtrTy =
+      Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+
+  // Emit "printf buffer" argument if printf is used, otherwise emit dummy
+  // "none" argument.
+  if (HiddenArgNumBytes >= 32) {
+    if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+      emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, Args);
+    else
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
+  }
+
+  // Emit "default queue" and "completion action" arguments if enqueue kernel is
+  // used, otherwise emit dummy "none" arguments.
+  if (HiddenArgNumBytes >= 48) {
+    if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+      emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, Args);
+    } else {
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
+    }
+  }
+}
+
+std::shared_ptr<msgpack::MapNode>
+MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
+                                      const SIProgramInfo &ProgramInfo) const {
+  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  const Function &F = MF.getFunction();
+
+  auto HSAKernelProps = std::make_shared<msgpack::MapNode>();
+  auto &Kern = *HSAKernelProps;
+
+  unsigned MaxKernArgAlign;
+  Kern[".kernarg_segment_size"] = std::make_shared<msgpack::ScalarNode>(
+      STM.getKernArgSegmentSize(F, MaxKernArgAlign));
+  Kern[".group_segment_fixed_size"] =
+      std::make_shared<msgpack::ScalarNode>(ProgramInfo.LDSSize);
+  Kern[".private_segment_fixed_size"] =
+      std::make_shared<msgpack::ScalarNode>(ProgramInfo.ScratchSize);
+  Kern[".kernarg_segment_align"] =
+      std::make_shared<msgpack::ScalarNode>(std::max(uint32_t(4), MaxKernArgAlign));
+  Kern[".wavefront_size"] =
+      std::make_shared<msgpack::ScalarNode>(STM.getWavefrontSize());
+  Kern[".sgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumSGPR);
+  Kern[".vgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumVGPR);
+  Kern[".max_flat_workgroup_size"] =
+      std::make_shared<msgpack::ScalarNode>(MFI.getMaxFlatWorkGroupSize());
+  Kern[".sgpr_spill_count"] =
+      std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledSGPRs());
+  Kern[".vgpr_spill_count"] =
+      std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledVGPRs());
+
+  return HSAKernelProps;
+}
+
+bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+  return TargetStreamer.EmitHSAMetadata(getHSAMetadataRoot(), true);
+}
+
+void MetadataStreamerV3::begin(const Module &Mod) {
+  emitVersion();
+  emitPrintf(Mod);
+  getRootMetadata("amdhsa.kernels").reset(new msgpack::ArrayNode());
+}
+
+void MetadataStreamerV3::end() {
+  std::string HSAMetadataString;
+  raw_string_ostream StrOS(HSAMetadataString);
+  yaml::Output YOut(StrOS);
+  YOut << HSAMetadataRoot;
+
+  if (DumpHSAMetadata)
+    dump(StrOS.str());
+  if (VerifyHSAMetadata)
+    verify(StrOS.str());
+}
+
+void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
+                                    const SIProgramInfo &ProgramInfo) {
+  auto &Func = MF.getFunction();
+  auto KernelProps = getHSAKernelProps(MF, ProgramInfo);
+
+  assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+         Func.getCallingConv() == CallingConv::SPIR_KERNEL);
+
+  auto &KernelsNode = getRootMetadata("amdhsa.kernels");
+  auto Kernels = cast<msgpack::ArrayNode>(KernelsNode.get());
+
+  {
+    auto &Kern = *KernelProps;
+    Kern[".name"] = std::make_shared<msgpack::ScalarNode>(Func.getName());
+    Kern[".symbol"] = std::make_shared<msgpack::ScalarNode>(
+        (Twine(Func.getName()) + Twine(".kd")).str());
+    emitKernelLanguage(Func, Kern);
+    emitKernelAttrs(Func, Kern);
+    emitKernelArgs(Func, Kern);
+  }
+
+  Kernels->push_back(std::move(KernelProps));
+}
+
 } // end namespace HSAMD
 } // end namespace AMDGPU
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 3424c956d7816..afc09baf952d6 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -19,10 +19,12 @@
 #include "AMDGPU.h"
 #include "AMDKernelCodeT.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
 #include "llvm/Support/AMDGPUMetadata.h"
 
 namespace llvm {
 
+class AMDGPUTargetStreamer;
 class Argument;
 class DataLayout;
 class Function;
@@ -34,10 +36,94 @@ class Type;
 namespace AMDGPU {
 namespace HSAMD {
 
-class MetadataStreamer final {
+class MetadataStreamer {
+public:
+  virtual ~MetadataStreamer(){};
+
+  virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0;
+
+  virtual void begin(const Module &Mod) = 0;
+
+  virtual void end() = 0;
+
+  virtual void emitKernel(const MachineFunction &MF,
+                          const SIProgramInfo &ProgramInfo) = 0;
+};
+
+class MetadataStreamerV3 final : public MetadataStreamer {
+private:
+  std::shared_ptr<msgpack::Node> HSAMetadataRoot =
+      std::make_shared<msgpack::MapNode>();
+
+  void dump(StringRef HSAMetadataString) const;
+
+  void verify(StringRef HSAMetadataString) const;
+
+  Optional<StringRef> getAccessQualifier(StringRef AccQual) const;
+
+  Optional<StringRef> getAddressSpaceQualifier(unsigned AddressSpace) const;
+
+  StringRef getValueKind(Type *Ty, StringRef TypeQual,
+                         StringRef BaseTypeName) const;
+
+  StringRef getValueType(Type *Ty, StringRef TypeName) const;
+
+  std::string getTypeName(Type *Ty, bool Signed) const;
+
+  std::shared_ptr<msgpack::ArrayNode>
+  getWorkGroupDimensions(MDNode *Node) const;
+
+  std::shared_ptr<msgpack::MapNode>
+  getHSAKernelProps(const MachineFunction &MF,
+                    const SIProgramInfo &ProgramInfo) const;
+
+  void emitVersion();
+
+  void emitPrintf(const Module &Mod);
+
+  void emitKernelLanguage(const Function &Func, msgpack::MapNode &Kern);
+
+  void emitKernelAttrs(const Function &Func, msgpack::MapNode &Kern);
+
+  void emitKernelArgs(const Function &Func, msgpack::MapNode &Kern);
+
+  void emitKernelArg(const Argument &Arg, unsigned &Offset,
+                     msgpack::ArrayNode &Args);
+
+  void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
+                     unsigned &Offset, msgpack::ArrayNode &Args,
+                     unsigned PointeeAlign = 0, StringRef Name = "",
+                     StringRef TypeName = "", StringRef BaseTypeName = "",
+                     StringRef AccQual = "", StringRef TypeQual = "");
+
+  void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
+                            msgpack::ArrayNode &Args);
+
+  std::shared_ptr<msgpack::Node> &getRootMetadata(StringRef Key) {
+    return (*cast<msgpack::MapNode>(HSAMetadataRoot.get()))[Key];
+  }
+
+  std::shared_ptr<msgpack::Node> &getHSAMetadataRoot() {
+    return HSAMetadataRoot;
+  }
+
+public:
+  MetadataStreamerV3() = default;
+  ~MetadataStreamerV3() = default;
+
+  bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
+
+  void begin(const Module &Mod) override;
+
+  void end() override;
+
+  void emitKernel(const MachineFunction &MF,
+                  const SIProgramInfo &ProgramInfo) override;
+};
+
+class MetadataStreamerV2 final : public MetadataStreamer {
 private:
   Metadata HSAMetadata;
-  AMDGPUAS AMDGPUASI;
 
   void dump(StringRef HSAMetadataString) const;
 
@@ -45,7 +131,7 @@ private:
 
   AccessQualifier getAccessQualifier(StringRef AccQual) const;
 
-  AddressSpaceQualifier getAddressSpaceQualifer(unsigned AddressSpace) const;
+  AddressSpaceQualifier getAddressSpaceQualifier(unsigned AddressSpace) const;
 
   ValueKind getValueKind(Type *Ty, StringRef TypeQual,
                          StringRef BaseTypeName) const;
@@ -83,19 +169,22 @@ private:
 
   void emitHiddenKernelArgs(const Function &Func);
 
-public:
-  MetadataStreamer() = default;
-  ~MetadataStreamer() = default;
-
   const Metadata &getHSAMetadata() const {
     return HSAMetadata;
   }
 
-  void begin(const Module &Mod);
+public:
+  MetadataStreamerV2() = default;
+  ~MetadataStreamerV2() = default;
+
+  bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
+
+  void begin(const Module &Mod) override;
 
-  void end();
+  void end() override;
 
-  void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo);
+  void emitKernel(const MachineFunction &MF,
+                  const SIProgramInfo &ProgramInfo) override;
 };
 
 } // end namespace HSAMD
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index f25f4d4693eac..a0a045e72a58f 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -29,7 +29,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
@@ -72,14 +72,12 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
   // make the right decision when generating code for different targets.
   const GCNSubtarget *Subtarget;
-  AMDGPUAS AMDGPUASI;
   bool EnableLateStructurizeCFG;
 
 public:
   explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
                               CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
     : SelectionDAGISel(*TM, OptLevel) {
-    AMDGPUASI = AMDGPU::getAMDGPUAS(*TM);
     EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
   }
   ~AMDGPUDAGToDAGISel() override = default;
@@ -87,7 +85,7 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AMDGPUArgumentUsageInfo>();
     AU.addRequired<AMDGPUPerfHintAnalysis>();
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
     SelectionDAGISel::getAnalysisUsage(AU);
   }
 
@@ -103,9 +101,12 @@ private:
   std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
   bool isNoNanSrc(SDValue N) const;
   bool isInlineImmediate(const SDNode *N) const;
-
+  bool isVGPRImm(const SDNode *N) const;
+  bool isUniformLoad(const SDNode *N) const;
   bool isUniformBr(const SDNode *N) const;
 
+  MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
+
   SDNode *glueCopyToM0(SDNode *N) const;
 
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
@@ -140,13 +141,6 @@ private:
                          SDValue &Offset, SDValue &SLC) const;
   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                          SDValue &Offset) const;
-  bool SelectMUBUFConstant(SDValue Constant,
-                           SDValue &SOffset,
-                           SDValue &ImmOffset) const;
-  bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset,
-                                  SDValue &ImmOffset) const;
-  bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
-                                   SDValue &ImmOffset, SDValue &VOffset) const;
 
   bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
                         SDValue &Offset, SDValue &SLC) const;
@@ -224,7 +218,6 @@ protected:
 
 class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
   const R600Subtarget *Subtarget;
-  AMDGPUAS AMDGPUASI;
 
   bool isConstantLoad(const MemSDNode *N, int cbID) const;
   bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
@@ -232,9 +225,7 @@ class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
                                        SDValue& Offset);
 public:
   explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
-      AMDGPUDAGToDAGISel(TM, OptLevel) {
-    AMDGPUASI = AMDGPU::getAMDGPUAS(*TM);
-      }
+      AMDGPUDAGToDAGISel(TM, OptLevel) {}
 
   void Select(SDNode *N) override;
 
@@ -251,12 +242,12 @@ protected:
 
 }  // end anonymous namespace
 
-INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel",
+INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                       "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
-INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                     "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
 
 /// This pass converts a legalized DAG into a AMDGPU-specific
@@ -350,7 +341,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
 }
 
 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
-  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS ||
+  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
       !Subtarget->ldsRequiresM0Init())
     return N;
 
@@ -372,6 +363,22 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
+MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
+                                                  EVT VT) const {
+  SDNode *Lo = CurDAG->getMachineNode(
+      AMDGPU::S_MOV_B32, DL, MVT::i32,
+      CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
+  SDNode *Hi =
+      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                             CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+  const SDValue Ops[] = {
+      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
+
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
+}
+
 static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
   switch (NumVectorElts) {
   case 1:
@@ -557,19 +564,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     }
 
     SDLoc DL(N);
-    SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                                CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
-                                                    MVT::i32));
-    SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                                CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
-    const SDValue Ops[] = {
-      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
-      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
-    };
-
-    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
-                                          N->getValueType(0), Ops));
+    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
     return;
   }
   case ISD::LOAD:
@@ -641,6 +636,20 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case AMDGPUISD::ATOMIC_CMP_SWAP:
     SelectATOMIC_CMP_SWAP(N);
     return;
+  case AMDGPUISD::CVT_PKRTZ_F16_F32:
+  case AMDGPUISD::CVT_PKNORM_I16_F32:
+  case AMDGPUISD::CVT_PKNORM_U16_F32:
+  case AMDGPUISD::CVT_PK_U16_U32:
+  case AMDGPUISD::CVT_PK_I16_I32: {
+    // Hack around using a legal type if f16 is illegal.
+    if (N->getValueType(0) == MVT::i32) {
+      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
+      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
+                              { N->getOperand(0), N->getOperand(1) });
+      SelectCode(N);
+      return;
+    }
+  }
   }
 
   SelectCode(N);
@@ -969,8 +978,6 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
 
   // default case
 
-  // FIXME: This is broken on SI where we still need to check if the base
-  // pointer is positive here.
   Base = Addr;
   Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
   Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
@@ -1000,55 +1007,72 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
   Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
   SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
 
+  ConstantSDNode *C1 = nullptr;
+  SDValue N0 = Addr;
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
-    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isUInt<32>(C1->getZExtValue()))
+      N0 = Addr.getOperand(0);
+    else
+      C1 = nullptr;
+  }
 
-    if (N0.getOpcode() == ISD::ADD) {
-      // (add (add N2, N3), C1) -> addr64
-      SDValue N2 = N0.getOperand(0);
-      SDValue N3 = N0.getOperand(1);
-      Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+  if (N0.getOpcode() == ISD::ADD) {
+    // (add N2, N3) -> addr64, or
+    // (add (add N2, N3), C1) -> addr64
+    SDValue N2 = N0.getOperand(0);
+    SDValue N3 = N0.getOperand(1);
+    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+
+    if (N2->isDivergent()) {
+      if (N3->isDivergent()) {
+        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
+        // addr64, and construct the resource from a 0 address.
+        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+        VAddr = N0;
+      } else {
+        // N2 is divergent, N3 is not.
+        Ptr = N3;
+        VAddr = N2;
+      }
+    } else {
+      // N2 is not divergent.
       Ptr = N2;
       VAddr = N3;
-    } else {
-      // (add N0, C1) -> offset
-      VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      Ptr = N0;
     }
-
-    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
-      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
-      return true;
-    }
-
-    if (isUInt<32>(C1->getZExtValue())) {
-      // Illegal offset, store it in soffset.
-      Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
-      SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                   CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
-                        0);
-      return true;
-    }
-  }
-
-  if (Addr.getOpcode() == ISD::ADD) {
-    // (add N0, N1) -> addr64
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
+    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  } else if (N0->isDivergent()) {
+    // N0 is divergent. Use it as the addr64, and construct the resource from a
+    // 0 address.
+    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+    VAddr = N0;
     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+  } else {
+    // N0 -> offset, or
+    // (N0 + C1) -> offset
+    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
     Ptr = N0;
-    VAddr = N1;
+  }
+
+  if (!C1) {
+    // No offset.
     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
     return true;
   }
 
-  // default case -> offset
-  VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
-  Ptr = Addr;
-  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
+    // Legal offset for instruction.
+    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+    return true;
+  }
 
+  // Illegal offset, store it in soffset.
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  SOffset =
+      SDValue(CurDAG->getMachineNode(
+                  AMDGPU::S_MOV_B32, DL, MVT::i32,
+                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
+              0);
   return true;
 }
 
@@ -1252,101 +1276,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
   return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
 }
 
-bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
-                                             SDValue &SOffset,
-                                             SDValue &ImmOffset) const {
-  SDLoc DL(Constant);
-  const uint32_t Align = 4;
-  const uint32_t MaxImm = alignDown(4095, Align);
-  uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
-  uint32_t Overflow = 0;
-
-  if (Imm > MaxImm) {
-    if (Imm <= MaxImm + 64) {
-      // Use an SOffset inline constant for 4..64
-      Overflow = Imm - MaxImm;
-      Imm = MaxImm;
-    } else {
-      // Try to keep the same value in SOffset for adjacent loads, so that
-      // the corresponding register contents can be re-used.
-      //
-      // Load values with all low-bits (except for alignment bits) set into
-      // SOffset, so that a larger range of values can be covered using
-      // s_movk_i32.
-      //
-      // Atomic operations fail to work correctly when individual address
-      // components are unaligned, even if their sum is aligned.
-      uint32_t High = (Imm + Align) & ~4095;
-      uint32_t Low = (Imm + Align) & 4095;
-      Imm = Low;
-      Overflow = High - Align;
-    }
-  }
-
-  // There is a hardware bug in SI and CI which prevents address clamping in
-  // MUBUF instructions from working correctly with SOffsets. The immediate
-  // offset is unaffected.
-  if (Overflow > 0 &&
-      Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
-    return false;
-
-  ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16);
-
-  if (Overflow <= 64)
-    SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32);
-  else
-    SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                      CurDAG->getTargetConstant(Overflow, DL, MVT::i32)),
-                      0);
-
-  return true;
-}
-
-bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset,
-                                                    SDValue &SOffset,
-                                                    SDValue &ImmOffset) const {
-  SDLoc DL(Offset);
-
-  if (!isa<ConstantSDNode>(Offset))
-    return false;
-
-  return SelectMUBUFConstant(Offset, SOffset, ImmOffset);
-}
-
-bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
-                                                     SDValue &SOffset,
-                                                     SDValue &ImmOffset,
-                                                     SDValue &VOffset) const {
-  SDLoc DL(Offset);
-
-  // Don't generate an unnecessary voffset for constant offsets.
-  if (isa<ConstantSDNode>(Offset)) {
-    SDValue Tmp1, Tmp2;
-
-    // When necessary, use a voffset in <= CI anyway to work around a hardware
-    // bug.
-    if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS ||
-        SelectMUBUFConstant(Offset, Tmp1, Tmp2))
-      return false;
-  }
-
-  if (CurDAG->isBaseWithConstantOffset(Offset)) {
-    SDValue N0 = Offset.getOperand(0);
-    SDValue N1 = Offset.getOperand(1);
-    if (cast<ConstantSDNode>(N1)->getSExtValue() >= 0 &&
-        SelectMUBUFConstant(N1, SOffset, ImmOffset)) {
-      VOffset = N0;
-      return true;
-    }
-  }
-
-  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
-  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
-  VOffset = Offset;
-
-  return true;
-}
-
 template <bool IsSigned>
 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
                                           SDValue &VAddr,
@@ -1451,7 +1380,11 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                      SDValue &Offset, bool &Imm) const {
   SDLoc SL(Addr);
 
-  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
+  // wraparound, because s_load instructions perform the addition in 64 bits.
+  if ((Addr.getValueType() != MVT::i32 ||
+       Addr->getFlags().hasNoUnsignedWrap()) &&
+      CurDAG->isBaseWithConstantOffset(Addr)) {
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
 
@@ -1521,9 +1454,13 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
 
     // (add n0, c0)
-    Base = N0;
-    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
-    return true;
+    // Don't peel off the offset (c0) if doing so could possibly lead
+    // the base (n0) to be negative.
+    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
+      Base = N0;
+      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
+      return true;
+    }
   }
 
   if (isa<ConstantSDNode>(Index))
@@ -1764,7 +1701,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
 void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
   MemSDNode *Mem = cast<MemSDNode>(N);
   unsigned AS = Mem->getAddressSpace();
-  if (AS == AMDGPUASI.FLAT_ADDRESS) {
+  if (AS == AMDGPUAS::FLAT_ADDRESS) {
     SelectCode(N);
     return;
   }
@@ -1812,9 +1749,8 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
     return;
   }
 
-  MachineSDNode::mmo_iterator MMOs = MF->allocateMemRefsArray(1);
-  *MMOs = Mem->getMemOperand();
-  CmpSwap->setMemRefs(MMOs, MMOs + 1);
+  MachineMemOperand *MMO = Mem->getMemOperand();
+  CurDAG->setNodeMemRefs(CmpSwap, {MMO});
 
   unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
   SDValue Extract
@@ -2113,6 +2049,80 @@ bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
   return isExtractHiElt(In, Src);
 }
 
+bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
+  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    return false;
+  }
+  const SIRegisterInfo *SIRI =
+    static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+  const SIInstrInfo * SII =
+    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  unsigned Limit = 0;
+  bool AllUsesAcceptSReg = true;
+  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+    Limit < 10 && U != E; ++U, ++Limit) {
+    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+
+    // If the register class is unknown, it could be an unknown
+    // register class that needs to be an SGPR, e.g. an inline asm
+    // constraint
+    if (!RC || SIRI->isSGPRClass(RC))
+      return false;
+
+    if (RC != &AMDGPU::VS_32RegClass) {
+      AllUsesAcceptSReg = false;
+      SDNode * User = *U;
+      if (User->isMachineOpcode()) {
+        unsigned Opc = User->getMachineOpcode();
+        MCInstrDesc Desc = SII->get(Opc);
+        if (Desc.isCommutable()) {
+          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
+          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
+          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
+            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
+            const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
+            if (CommutedRC == &AMDGPU::VS_32RegClass)
+              AllUsesAcceptSReg = true;
+          }
+        }
+      }
+      // If "AllUsesAcceptSReg == false" so far we haven't suceeded
+      // commuting current user. This means have at least one use
+      // that strictly require VGPR. Thus, we will not attempt to commute
+      // other user instructions.
+      if (!AllUsesAcceptSReg)
+        break;
+    }
+  }
+  return !AllUsesAcceptSReg && (Limit < 10);
+}
+
+bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
+  auto Ld = cast<LoadSDNode>(N);
+
+  return Ld->getAlignment() >= 4 &&
+        (
+          (
+            (
+              Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS       ||
+              Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
+            )
+            &&
+            !N->isDivergent()
+          )
+          ||
+          (
+            Subtarget->getScalarizeGlobalBehavior() &&
+            Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+            !Ld->isVolatile() &&
+            !N->isDivergent() &&
+            static_cast<const SITargetLowering *>(
+              getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)
+          )
+        );
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
@@ -2148,10 +2158,10 @@ bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
   if (!N->readMem())
     return false;
   if (CbId == -1)
-    return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
-           N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
+    return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+           N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
 
-  return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
+  return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
 }
 
 bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 21e44e9589d3c..6951c915b1772 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -128,10 +128,8 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
 }
 
 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
-  KnownBits Known;
   EVT VT = Op.getValueType();
-  DAG.computeKnownBits(Op, Known);
-
+  KnownBits Known = DAG.computeKnownBits(Op);
   return VT.getSizeInBits() - Known.countMinLeadingZeros();
 }
 
@@ -146,7 +144,6 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                            const AMDGPUSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
-  AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
   // Lower floating point store/load to integer store/load to reduce the number
   // of patterns in tablegen.
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
@@ -318,6 +315,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::FLOG, MVT::f32, Custom);
   setOperationAction(ISD::FLOG10, MVT::f32, Custom);
+  setOperationAction(ISD::FEXP, MVT::f32, Custom);
 
 
   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
@@ -450,6 +448,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FCOS, VT, Expand);
     setOperationAction(ISD::FDIV, VT, Expand);
     setOperationAction(ISD::FEXP2, VT, Expand);
+    setOperationAction(ISD::FEXP, VT, Expand);
     setOperationAction(ISD::FLOG2, VT, Expand);
     setOperationAction(ISD::FREM, VT, Expand);
     setOperationAction(ISD::FLOG, VT, Expand);
@@ -470,6 +469,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
     setOperationAction(ISD::SETCC, VT, Expand);
+    setOperationAction(ISD::FCANONICALIZE, VT, Expand);
   }
 
   // This causes using an unrolled select operation rather than expansion with
@@ -550,6 +550,8 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case ISD::FMAD:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case ISD::FSIN:
   case ISD::FTRUNC:
   case ISD::FRINT:
@@ -562,6 +564,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case AMDGPUISD::FMUL_LEGACY:
   case AMDGPUISD::FMIN_LEGACY:
   case AMDGPUISD::FMAX_LEGACY:
+  case AMDGPUISD::FMED3:
     return true;
   default:
     return false;
@@ -650,8 +653,11 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
 }
 
 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
-                                                 ISD::LoadExtType,
+                                                 ISD::LoadExtType ExtTy,
                                                  EVT NewVT) const {
+  // TODO: This may be worth removing. Check regression tests for diffs.
+  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
+    return false;
 
   unsigned NewSize = NewVT.getStoreSizeInBits();
 
@@ -662,6 +668,18 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
   EVT OldVT = N->getValueType(0);
   unsigned OldSize = OldVT.getStoreSizeInBits();
 
+  MemSDNode *MN = cast<MemSDNode>(N);
+  unsigned AS = MN->getAddressSpace();
+  // Do not shrink an aligned scalar load to sub-dword.
+  // Scalar engine cannot do sub-dword loads.
+  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
+      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+       (isa<LoadSDNode>(N) &&
+        AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
+      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
+    return false;
+
   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
   // extloads, so doing one requires using a buffer_load. In cases where we
   // still couldn't use a scalar load, using the wider load shouldn't really
@@ -722,7 +740,7 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
     {
       const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
       if (L->getMemOperand()->getAddrSpace()
-      == AMDGPUASI.CONSTANT_ADDRESS_32BIT)
+      == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
         return true;
       return false;
     }
@@ -1140,6 +1158,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
     return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
   case ISD::FLOG10:
     return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
+  case ISD::FEXP:
+    return lowerFEXP(Op, DAG);
   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
@@ -1188,8 +1208,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = G->getGlobal();
 
-  if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
-      G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) {
+  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
     if (!MFI->isEntryFunction()) {
       const Function &Fn = DAG.getMachineFunction().getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
@@ -2213,6 +2233,34 @@ SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
 }
 
+// Return M_LOG2E of appropriate type
+static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) {
+  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
+  case MVT::f32:
+    return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT);
+  case MVT::f16:
+    return DAG.getConstantFP(
+      APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"),
+      SL, VT);
+  case MVT::f64:
+    return DAG.getConstantFP(
+      APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT);
+  default:
+    llvm_unreachable("unsupported fp type");
+  }
+}
+
+// exp2(M_LOG2E_F * f);
+SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  const SDValue K = getLog2EVal(DAG, SL, VT);
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
+  return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
+}
+
 static bool isCtlzOpc(unsigned Opc) {
   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
 }
@@ -2669,21 +2717,33 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
     AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
 }
 
-static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
-                        TargetLowering::DAGCombinerInfo &DCI) {
-
+static SDValue simplifyI24(SDNode *Node24,
+                           TargetLowering::DAGCombinerInfo &DCI) {
   SelectionDAG &DAG = DCI.DAG;
-  SDValue Op = Node24->getOperand(OpIdx);
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  EVT VT = Op.getValueType();
+  SDValue LHS = Node24->getOperand(0);
+  SDValue RHS = Node24->getOperand(1);
 
-  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
-  APInt KnownZero, KnownOne;
-  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
-  if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
-    return true;
+  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
 
-  return false;
+  // First try to simplify using GetDemandedBits which allows the operands to
+  // have other uses, but will only perform simplifications that involve
+  // bypassing some nodes for this user.
+  SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
+  SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
+  if (DemandedLHS || DemandedRHS)
+    return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(),
+                       DemandedLHS ? DemandedLHS : LHS,
+                       DemandedRHS ? DemandedRHS : RHS);
+
+  // Now try SimplifyDemandedBits which can simplify the nodes used by our
+  // operands if this node is the only user.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
+    return SDValue(Node24, 0);
+  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
+    return SDValue(Node24, 0);
+
+  return SDValue();
 }
 
 template <typename IntTy>
@@ -2920,8 +2980,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
     // shl (ext x) => zext (shl x), if shift does not overflow int
     if (VT != MVT::i64)
       break;
-    KnownBits Known;
-    DAG.computeKnownBits(X, Known);
+    KnownBits Known = DAG.computeKnownBits(X);
     unsigned LZ = Known.countMinLeadingZeros();
     if (LZ < RHSVal)
       break;
@@ -3080,8 +3139,7 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
          Src.getOpcode() == ISD::SRA ||
          Src.getOpcode() == ISD::SHL)) {
       SDValue Amt = Src.getOperand(1);
-      KnownBits Known;
-      DAG.computeKnownBits(Amt, Known);
+      KnownBits Known = DAG.computeKnownBits(Amt);
       unsigned Size = VT.getScalarSizeInBits();
       if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
           (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
@@ -3233,8 +3291,8 @@ SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
   SelectionDAG &DAG = DCI.DAG;
 
   // Simplify demanded bits before splitting into multiple users.
-  if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
-    return SDValue();
+  if (SDValue V = simplifyI24(N, DCI))
+    return V;
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -3449,9 +3507,27 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
 }
 
-static bool isConstantFPZero(SDValue N) {
-  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
-    return C->isZero() && !C->isNegative();
+static bool isInv2Pi(const APFloat &APF) {
+  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
+  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
+  static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
+
+  return APF.bitwiseIsEqual(KF16) ||
+         APF.bitwiseIsEqual(KF32) ||
+         APF.bitwiseIsEqual(KF64);
+}
+
+// 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
+// additional cost to negate them.
+bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
+  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
+    if (C->isZero() && !C->isNegative())
+      return true;
+
+    if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
+      return true;
+  }
+
   return false;
 }
 
@@ -3461,6 +3537,10 @@ static unsigned inverseMinMax(unsigned Opc) {
     return ISD::FMINNUM;
   case ISD::FMINNUM:
     return ISD::FMAXNUM;
+  case ISD::FMAXNUM_IEEE:
+    return ISD::FMINNUM_IEEE;
+  case ISD::FMINNUM_IEEE:
+    return ISD::FMAXNUM_IEEE;
   case AMDGPUISD::FMAX_LEGACY:
     return AMDGPUISD::FMIN_LEGACY;
   case AMDGPUISD::FMIN_LEGACY:
@@ -3566,6 +3646,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::FMAXNUM:
   case ISD::FMINNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINNUM_IEEE:
   case AMDGPUISD::FMAX_LEGACY:
   case AMDGPUISD::FMIN_LEGACY: {
     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
@@ -3577,9 +3659,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
     SDValue RHS = N0.getOperand(1);
 
     // 0 doesn't have a negated inline immediate.
-    // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
-    // operations.
-    if (isConstantFPZero(RHS))
+    // TODO: This constant check should be generalized to other operations.
+    if (isConstantCostlierToNegate(RHS))
       return SDValue();
 
     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
@@ -3591,6 +3672,16 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
     return Res;
   }
+  case AMDGPUISD::FMED3: {
+    SDValue Ops[3];
+    for (unsigned I = 0; I < 3; ++I)
+      Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
+
+    SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
+    if (!N0.hasOneUse())
+      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+    return Res;
+  }
   case ISD::FP_EXTEND:
   case ISD::FTRUNC:
   case ISD::FRINT:
@@ -3737,9 +3828,10 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
       if (Src.getValueType() == MVT::i64) {
         SDLoc SL(N);
         uint64_t CVal = C->getZExtValue();
-        return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
-                           DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
-                           DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+        SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
       }
     }
 
@@ -3786,9 +3878,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   case AMDGPUISD::MUL_U24:
   case AMDGPUISD::MULHI_I24:
   case AMDGPUISD::MULHI_U24: {
-    // If the first call to simplify is successfull, then N may end up being
-    // deleted, so we shouldn't call simplifyI24 again.
-    simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
+    if (SDValue V = simplifyI24(N, DCI))
+      return V;
     return SDValue();
   }
   case AMDGPUISD::MUL_LOHI_I24:
@@ -3943,13 +4034,12 @@ SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
                                                    const SDLoc &SL,
                                                    SDValue Chain,
-                                                   SDValue StackPtr,
                                                    SDValue ArgVal,
                                                    int64_t Offset) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
 
-  SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);
+  SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
                                MachineMemOperand::MODereferenceable);
   return Store;
@@ -4111,6 +4201,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
+  NODE_NAME_CASE(SBUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_STORE)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
@@ -4210,33 +4301,42 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
   }
   case AMDGPUISD::MUL_U24:
   case AMDGPUISD::MUL_I24: {
-    KnownBits LHSKnown, RHSKnown;
-    DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
-    DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
-
+    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                       RHSKnown.countMinTrailingZeros();
     Known.Zero.setLowBits(std::min(TrailZ, 32u));
 
-    unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u);
-    unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u);
-    unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
-    if (MaxValBits >= 32)
-      break;
+    // Truncate to 24 bits.
+    LHSKnown = LHSKnown.trunc(24);
+    RHSKnown = RHSKnown.trunc(24);
+
     bool Negative = false;
     if (Opc == AMDGPUISD::MUL_I24) {
-      bool LHSNegative = !!(LHSKnown.One  & (1 << 23));
-      bool LHSPositive = !!(LHSKnown.Zero & (1 << 23));
-      bool RHSNegative = !!(RHSKnown.One  & (1 << 23));
-      bool RHSPositive = !!(RHSKnown.Zero & (1 << 23));
+      unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
+      unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
+      unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
+      if (MaxValBits >= 32)
+        break;
+      bool LHSNegative = LHSKnown.isNegative();
+      bool LHSPositive = LHSKnown.isNonNegative();
+      bool RHSNegative = RHSKnown.isNegative();
+      bool RHSPositive = RHSKnown.isNonNegative();
       if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
         break;
       Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
-    }
-    if (Negative)
-      Known.One.setHighBits(32 - MaxValBits);
-    else
+      if (Negative)
+        Known.One.setHighBits(32 - MaxValBits);
+      else
+        Known.Zero.setHighBits(32 - MaxValBits);
+    } else {
+      unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
+      unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
+      unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
+      if (MaxValBits >= 32)
+        break;
       Known.Zero.setHighBits(32 - MaxValBits);
+    }
     break;
   }
   case AMDGPUISD::PERM: {
@@ -4244,9 +4344,8 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
     if (!CMask)
       return;
 
-    KnownBits LHSKnown, RHSKnown;
-    DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
-    DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
     unsigned Sel = CMask->getZExtValue();
 
     for (unsigned I = 0; I < 32; I += 8) {
@@ -4320,3 +4419,107 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
     return 1;
   }
 }
+
+bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+                                                        const SelectionDAG &DAG,
+                                                        bool SNaN,
+                                                        unsigned Depth) const {
+  unsigned Opcode = Op.getOpcode();
+  switch (Opcode) {
+  case AMDGPUISD::FMIN_LEGACY:
+  case AMDGPUISD::FMAX_LEGACY: {
+    if (SNaN)
+      return true;
+
+    // TODO: Can check no nans on one of the operands for each one, but which
+    // one?
+    return false;
+  }
+  case AMDGPUISD::FMUL_LEGACY:
+  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
+    if (SNaN)
+      return true;
+    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+  }
+  case AMDGPUISD::FMED3:
+  case AMDGPUISD::FMIN3:
+  case AMDGPUISD::FMAX3:
+  case AMDGPUISD::FMAD_FTZ: {
+    if (SNaN)
+      return true;
+    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+           DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
+  }
+  case AMDGPUISD::CVT_F32_UBYTE0:
+  case AMDGPUISD::CVT_F32_UBYTE1:
+  case AMDGPUISD::CVT_F32_UBYTE2:
+  case AMDGPUISD::CVT_F32_UBYTE3:
+    return true;
+
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RSQ:
+  case AMDGPUISD::RCP_LEGACY:
+  case AMDGPUISD::RSQ_LEGACY:
+  case AMDGPUISD::RSQ_CLAMP: {
+    if (SNaN)
+      return true;
+
+    // TODO: Need is known positive check.
+    return false;
+  }
+  case AMDGPUISD::LDEXP:
+  case AMDGPUISD::FRACT: {
+    if (SNaN)
+      return true;
+    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+  }
+  case AMDGPUISD::DIV_SCALE:
+  case AMDGPUISD::DIV_FMAS:
+  case AMDGPUISD::DIV_FIXUP:
+  case AMDGPUISD::TRIG_PREOP:
+    // TODO: Refine on operands.
+    return SNaN;
+  case AMDGPUISD::SIN_HW:
+  case AMDGPUISD::COS_HW: {
+    // TODO: Need check for infinity
+    return SNaN;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrinsicID
+      = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    // TODO: Handle more intrinsics
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_cubeid:
+      return true;
+
+    case Intrinsic::amdgcn_frexp_mant: {
+      if (SNaN)
+        return true;
+      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+    }
+    case Intrinsic::amdgcn_cvt_pkrtz: {
+      if (SNaN)
+        return true;
+      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
+    }
+    case Intrinsic::amdgcn_fdot2:
+      // TODO: Refine on operand
+      return SNaN;
+    default:
+      return false;
+    }
+  }
+  default:
+    return false;
+  }
+}
+
+TargetLowering::AtomicExpansionKind
+AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+  if (RMW->getOperation() == AtomicRMWInst::Nand)
+    return AtomicExpansionKind::CmpXChg;
+  return AtomicExpansionKind::None;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index a4c3b413e1037..0d22cb2e3e20b 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -41,8 +41,6 @@ public:
   static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
 
 protected:
-  AMDGPUAS AMDGPUASI;
-
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   /// Split a vector store into multiple scalar stores.
@@ -58,8 +56,9 @@ protected:
   SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFLOG(SDValue Op, SelectionDAG &Dag,
+  SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG,
                     double Log2BaseInverted) const;
+  SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;
 
@@ -95,6 +94,8 @@ protected:
   SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
                              SDValue RHS, DAGCombinerInfo &DCI) const;
   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  bool isConstantCostlierToNegate(SDValue N) const;
   SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -246,6 +247,11 @@ public:
                                            const SelectionDAG &DAG,
                                            unsigned Depth = 0) const override;
 
+  bool isKnownNeverNaNForTargetNode(SDValue Op,
+                                    const SelectionDAG &DAG,
+                                    bool SNaN = false,
+                                    unsigned Depth = 0) const override;
+
   /// Helper function that adds Reg to the LiveIn list of the DAG's
   /// MachineFunction.
   ///
@@ -279,7 +285,6 @@ public:
   SDValue storeStackInputValue(SelectionDAG &DAG,
                                const SDLoc &SL,
                                SDValue Chain,
-                               SDValue StackPtr,
                                SDValue ArgVal,
                                int64_t Offset) const;
 
@@ -299,13 +304,11 @@ public:
   uint32_t getImplicitParameterOffset(const MachineFunction &MF,
                                       const ImplicitParameter Param) const;
 
-  AMDGPUAS getAMDGPUAS() const {
-    return AMDGPUASI;
-  }
-
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
+
+  AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
 };
 
 namespace AMDGPUISD {
@@ -357,6 +360,7 @@ enum NodeType : unsigned {
   SIN_HW,
   FMAX_LEGACY,
   FMIN_LEGACY,
+
   FMAX3,
   SMAX3,
   UMAX3,
@@ -479,6 +483,7 @@ enum NodeType : unsigned {
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
   BUFFER_LOAD_FORMAT_D16,
+  SBUFFER_LOAD,
   BUFFER_STORE,
   BUFFER_STORE_FORMAT,
   BUFFER_STORE_FORMAT_D16,
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index 35dd9eb0a478d..945c9acd379a5 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -44,7 +44,7 @@ ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
               cl::desc("Cost of alloca argument"));
 
 // If the amount of scratch memory to eliminate exceeds our ability to allocate
-// it into registers we gain nothing by agressively inlining functions for that
+// it into registers we gain nothing by aggressively inlining functions for that
 // heuristic.
 static cl::opt<unsigned>
 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
@@ -118,8 +118,6 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
   if (!Callee)
     return (unsigned)Thres;
 
-  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent());
-
   // If we have a pointer to private array passed into a function
   // it will not be optimized out, leaving scratch usage.
   // Increase the inline threshold to allow inliniting in this case.
@@ -128,7 +126,7 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
   for (Value *PtrArg : CS.args()) {
     Type *Ty = PtrArg->getType();
     if (!Ty->isPointerTy() ||
-        Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
+        Ty->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
       continue;
     PtrArg = GetUnderlyingObject(PtrArg, DL);
     if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
@@ -174,18 +172,23 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
   Function *Caller = CS.getCaller();
   TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
 
-  if (!Callee || Callee->isDeclaration() || CS.isNoInline() ||
-      !TTI.areInlineCompatible(Caller, Callee))
-    return llvm::InlineCost::getNever();
+  if (!Callee || Callee->isDeclaration())
+    return llvm::InlineCost::getNever("undefined callee");
+
+  if (CS.isNoInline())
+    return llvm::InlineCost::getNever("noinline");
+
+  if (!TTI.areInlineCompatible(Caller, Callee))
+    return llvm::InlineCost::getNever("incompatible");
 
   if (CS.hasFnAttr(Attribute::AlwaysInline)) {
     if (isInlineViable(*Callee))
-      return llvm::InlineCost::getAlways();
-    return llvm::InlineCost::getNever();
+      return llvm::InlineCost::getAlways("alwaysinline viable");
+    return llvm::InlineCost::getNever("alwaysinline unviable");
   }
 
   if (isWrapperOnlyCall(CS))
-    return llvm::InlineCost::getAlways();
+    return llvm::InlineCost::getAlways("wrapper-only call");
 
   InlineParams LocalParams = Params;
   LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 7442a59e594f1..82644be265638 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -62,18 +62,10 @@ def AMDGPULoopOp : SDTypeProfile<0, 2,
   [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
 >;
 
-def AMDGPUBreakOp : SDTypeProfile<1, 1,
-  [SDTCisVT<0, i64>, SDTCisVT<1, i64>]
->;
-
 def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
   [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
 >;
 
-def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
-  [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
->;
-
 def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
   [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
 >;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 219d430fbb395..8eb49d49b2e08 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -55,7 +55,6 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector(
 #define GET_GLOBALISEL_TEMPORARIES_INIT
 #include "AMDGPUGenGlobalISel.inc"
 #undef GET_GLOBALISEL_TEMPORARIES_INIT
-      ,AMDGPUASI(STI.getAMDGPUAS())
 {
 }
 
@@ -506,8 +505,8 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
   if (!I.hasOneMemOperand())
     return false;
 
-  if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
-      (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT)
+  if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+      (*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
     return false;
 
   if (!isInstrUniform(I))
@@ -631,6 +630,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
     return selectImpl(I, CoverageInfo);
   case TargetOpcode::G_ADD:
     return selectG_ADD(I);
+  case TargetOpcode::G_INTTOPTR:
   case TargetOpcode::G_BITCAST:
     return selectCOPY(I);
   case TargetOpcode::G_CONSTANT:
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 68b40b20aca24..449431adc561a 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -105,9 +105,6 @@ private:
 #define GET_GLOBALISEL_TEMPORARIES_DECL
 #include "AMDGPUGenGlobalISel.inc"
 #undef GET_GLOBALISEL_TEMPORARIES_DECL
-
-protected:
-  AMDGPUAS AMDGPUASI;
 };
 
 } // End llvm namespace.
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index c9c932ef2f5fb..eb8f2002ff2dc 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -135,6 +135,12 @@ def brtarget   : Operand<OtherVT>;
 // Misc. PatFrags
 //===----------------------------------------------------------------------===//
 
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+  (ops node:$src0),
+  (op $src0),
+  [{ return N->hasOneUse(); }]
+>;
+
 class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
   (ops node:$src0, node:$src1),
   (op $src0, $src1),
@@ -152,13 +158,21 @@ def smax_oneuse : HasOneUseBinOp<smax>;
 def smin_oneuse : HasOneUseBinOp<smin>;
 def umax_oneuse : HasOneUseBinOp<umax>;
 def umin_oneuse : HasOneUseBinOp<umin>;
+
 def fminnum_oneuse : HasOneUseBinOp<fminnum>;
 def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+
+def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>;
+def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>;
+
+
 def and_oneuse : HasOneUseBinOp<and>;
 def or_oneuse : HasOneUseBinOp<or>;
 def xor_oneuse : HasOneUseBinOp<xor>;
 } // Properties = [SDNPCommutative, SDNPAssociative]
 
+def not_oneuse : HasOneUseUnaryOp<not>;
+
 def add_oneuse : HasOneUseBinOp<add>;
 def sub_oneuse : HasOneUseBinOp<sub>;
 
@@ -167,6 +181,9 @@ def shl_oneuse : HasOneUseBinOp<shl>;
 
 def select_oneuse : HasOneUseTernaryOp<select>;
 
+def AMDGPUmul_u24_oneuse : HasOneUseBinOp<AMDGPUmul_u24>;
+def AMDGPUmul_i24_oneuse : HasOneUseBinOp<AMDGPUmul_i24>;
+
 def srl_16 : PatFrag<
   (ops node:$src0), (srl_oneuse node:$src0, (i32 16))
 >;
@@ -328,37 +345,37 @@ class StoreHi16<SDPatternOperator op> : PatFrag <
 >;
 
 class PrivateAddress : CodePatPred<[{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
 }]>;
 
 class ConstantAddress : CodePatPred<[{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
 }]>;
 
 class LocalAddress : CodePatPred<[{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }]>;
 
 class GlobalAddress : CodePatPred<[{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
 }]>;
 
 class GlobalLoadAddress : CodePatPred<[{
   auto AS = cast<MemSDNode>(N)->getAddressSpace();
-  return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS;
+  return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS;
 }]>;
 
 class FlatLoadAddress : CodePatPred<[{
   const auto AS = cast<MemSDNode>(N)->getAddressSpace();
-  return AS == AMDGPUASI.FLAT_ADDRESS ||
-         AS == AMDGPUASI.GLOBAL_ADDRESS ||
-         AS == AMDGPUASI.CONSTANT_ADDRESS;
+  return AS == AMDGPUAS::FLAT_ADDRESS ||
+         AS == AMDGPUAS::GLOBAL_ADDRESS ||
+         AS == AMDGPUAS::CONSTANT_ADDRESS;
 }]>;
 
 class FlatStoreAddress : CodePatPred<[{
   const auto AS = cast<MemSDNode>(N)->getAddressSpace();
-  return AS == AMDGPUASI.FLAT_ADDRESS ||
-         AS == AMDGPUASI.GLOBAL_ADDRESS;
+  return AS == AMDGPUAS::FLAT_ADDRESS ||
+         AS == AMDGPUAS::GLOBAL_ADDRESS;
 }]>;
 
 class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
@@ -480,7 +497,7 @@ def az_extloadi16_constant : ConstantLoad <az_extloadi16>;
 class local_binary_atomic_op<SDNode atomic_op> :
   PatFrag<(ops node:$ptr, node:$value),
     (atomic_op node:$ptr, node:$value), [{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }]>;
 
 def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
@@ -497,14 +514,14 @@ def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
 
 def mskor_global : PatFrag<(ops node:$val, node:$ptr),
                             (AMDGPUstore_mskor node:$val, node:$ptr), [{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
 }]>;
 
 class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag<
     (ops node:$ptr, node:$cmp, node:$swap),
     (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
       AtomicSDNode *AN = cast<AtomicSDNode>(N);
-      return AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
+      return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }]>;
 
 def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>;
@@ -513,17 +530,17 @@ multiclass global_binary_atomic_op<SDNode atomic_op> {
   def "" : PatFrag<
         (ops node:$ptr, node:$value),
         (atomic_op node:$ptr, node:$value),
-        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
 
   def _noret : PatFrag<
         (ops node:$ptr, node:$value),
         (atomic_op node:$ptr, node:$value),
-        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
 
   def _ret : PatFrag<
         (ops node:$ptr, node:$value),
         (atomic_op node:$ptr, node:$value),
-        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
 }
 
 defm atomic_swap_global : global_binary_atomic_op<atomic_swap>;
@@ -550,12 +567,12 @@ def atomic_cmp_swap_global : PatFrag<
 def atomic_cmp_swap_global_noret : PatFrag<
   (ops node:$ptr, node:$cmp, node:$value),
   (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
-  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
 
 def atomic_cmp_swap_global_ret : PatFrag<
   (ops node:$ptr, node:$cmp, node:$value),
   (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
-  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
 
 //===----------------------------------------------------------------------===//
 // Misc Pattern Fragments
@@ -787,18 +804,30 @@ class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
   (BIT_ALIGN $src0, $src0, $src1)
 >;
 
-// This matches 16 permutations of
-// max(min(x, y), min(max(x, y), z))
-class IntMed3Pat<Instruction med3Inst,
+multiclass IntMed3Pat<Instruction med3Inst,
+                 SDPatternOperator min,
                  SDPatternOperator max,
-                 SDPatternOperator max_oneuse,
                  SDPatternOperator min_oneuse,
-                 ValueType vt = i32> : AMDGPUPat<
+                 SDPatternOperator max_oneuse,
+                 ValueType vt = i32> {
+
+  // This matches 16 permutations of 
+  // min(max(a, b), max(min(a, b), c))
+  def : AMDGPUPat <
+  (min (max_oneuse vt:$src0, vt:$src1),
+       (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)),
+  (med3Inst vt:$src0, vt:$src1, vt:$src2)
+>;
+
+  // This matches 16 permutations of 
+  // max(min(x, y), min(max(x, y), z))
+  def : AMDGPUPat <
   (max (min_oneuse vt:$src0, vt:$src1),
        (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
   (med3Inst $src0, $src1, $src2)
 >;
-
+}
+  
 // Special conversion patterns
 
 def cvt_rpi_i32_f32 : PatFrag <
@@ -813,6 +842,7 @@ def cvt_flr_i32_f32 : PatFrag <
   [{ (void)N; return TM.Options.NoNaNsFPMath; }]
 >;
 
+let AddedComplexity = 2 in {
 class IMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat <
   (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
   !if(HasClamp, (Inst $src0, $src1, $src2, (i1 0)),
@@ -824,6 +854,7 @@ class UMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat <
   !if(HasClamp, (Inst $src0, $src1, $src2, (i1 0)),
                 (Inst $src0, $src1, $src2))
 >;
+} // AddedComplexity.
 
 class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat <
   (fdiv FP_ONE, vt:$src),
@@ -834,3 +865,25 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
   (AMDGPUrcp (fsqrt vt:$src)),
   (RsqInst $src)
 >;
+
+// Instructions which select to the same v_min_f*
+def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
+  [(fminnum_ieee node:$src0, node:$src1),
+   (fminnum node:$src0, node:$src1)]
+>;
+
+// Instructions which select to the same v_max_f*
+def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1),
+  [(fmaxnum_ieee node:$src0, node:$src1),
+   (fmaxnum node:$src0, node:$src1)]
+>;
+
+def fminnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+  [(fminnum_ieee_oneuse node:$src0, node:$src1),
+   (fminnum_oneuse node:$src0, node:$src1)]
+>;
+
+def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+  [(fmaxnum_ieee_oneuse node:$src0, node:$src1),
+   (fmaxnum_oneuse node:$src0, node:$src1)]
+>;
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 896e2055cf620..02108ca3ddd78 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -40,7 +40,7 @@ StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
   if (IntrID < Intrinsic::num_intrinsics)
     return StringRef();
 
-  assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
+  assert(IntrID < SIIntrinsic::num_AMDGPU_intrinsics &&
          "Invalid intrinsic ID");
 
   return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
@@ -91,7 +91,7 @@ Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
     = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
 
   AttributeList AS =
-      getAttributes(M->getContext(), static_cast<AMDGPUIntrinsic::ID>(IntrID));
+      getAttributes(M->getContext(), static_cast<SIIntrinsic::ID>(IntrID));
   F->setAttributes(AS);
   return F;
 }
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index ef42f9a319af6..a1a094dded23d 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -20,7 +20,7 @@
 namespace llvm {
 class TargetMachine;
 
-namespace AMDGPUIntrinsic {
+namespace SIIntrinsic {
 enum ID {
   last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
 #define GET_INTRINSIC_ENUM_VALUES
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
deleted file mode 100644
index 230a046285047..0000000000000
--- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- AMDGPUIntrinsics.td - Common intrinsics  -*- tablegen -*-----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines intrinsics that are used by all hw codegen targets.
-//
-//===----------------------------------------------------------------------===//
-
-let TargetPrefix = "AMDGPU", isTarget = 1 in {
-  def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
-}
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 87b072c9ea20a..ef85c1040545f 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -32,20 +32,52 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
   };
 
-  auto AMDGPUAS = ST.getAMDGPUAS();
-
   const LLT S1 = LLT::scalar(1);
-  const LLT V2S16 = LLT::vector(2, 16);
-
   const LLT S32 = LLT::scalar(32);
   const LLT S64 = LLT::scalar(64);
   const LLT S512 = LLT::scalar(512);
 
+  const LLT V2S16 = LLT::vector(2, 16);
+  const LLT V4S16 = LLT::vector(4, 16);
+  const LLT V8S16 = LLT::vector(8, 16);
+
+  const LLT V2S32 = LLT::vector(2, 32);
+  const LLT V3S32 = LLT::vector(3, 32);
+  const LLT V4S32 = LLT::vector(4, 32);
+  const LLT V5S32 = LLT::vector(5, 32);
+  const LLT V6S32 = LLT::vector(6, 32);
+  const LLT V7S32 = LLT::vector(7, 32);
+  const LLT V8S32 = LLT::vector(8, 32);
+  const LLT V9S32 = LLT::vector(9, 32);
+  const LLT V10S32 = LLT::vector(10, 32);
+  const LLT V11S32 = LLT::vector(11, 32);
+  const LLT V12S32 = LLT::vector(12, 32);
+  const LLT V13S32 = LLT::vector(13, 32);
+  const LLT V14S32 = LLT::vector(14, 32);
+  const LLT V15S32 = LLT::vector(15, 32);
+  const LLT V16S32 = LLT::vector(16, 32);
+
+  const LLT V2S64 = LLT::vector(2, 64);
+  const LLT V3S64 = LLT::vector(3, 64);
+  const LLT V4S64 = LLT::vector(4, 64);
+  const LLT V5S64 = LLT::vector(5, 64);
+  const LLT V6S64 = LLT::vector(6, 64);
+  const LLT V7S64 = LLT::vector(7, 64);
+  const LLT V8S64 = LLT::vector(8, 64);
+
+  std::initializer_list<LLT> AllS32Vectors =
+    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
+     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
+  std::initializer_list<LLT> AllS64Vectors =
+    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
+
   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
-  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS.FLAT_ADDRESS);
-  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS.PRIVATE_ADDRESS);
+  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
+  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
+
+  const LLT CodePtr = FlatPtr;
 
   const LLT AddrSpaces[] = {
     GlobalPtr,
@@ -55,13 +87,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
     PrivatePtr
   };
 
+  setAction({G_BRCOND, S1}, Legal);
+
   setAction({G_ADD, S32}, Legal);
   setAction({G_ASHR, S32}, Legal);
   setAction({G_SUB, S32}, Legal);
   setAction({G_MUL, S32}, Legal);
-  setAction({G_AND, S32}, Legal);
-  setAction({G_OR, S32}, Legal);
-  setAction({G_XOR, S32}, Legal);
+
+  // FIXME: 64-bit ones only legal for scalar
+  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
+    .legalFor({S32, S1, S64, V2S32});
+
+  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
+                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
+    .legalFor({{S32, S1}});
 
   setAction({G_BITCAST, V2S16}, Legal);
   setAction({G_BITCAST, 1, S32}, Legal);
@@ -90,35 +129,80 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
   // between these two scenarios.
   setAction({G_CONSTANT, S1}, Legal);
 
-  setAction({G_FADD, S32}, Legal);
+  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
+
+  getActionDefinitionsBuilder(
+    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA})
+    .legalFor({S32, S64});
+
+  getActionDefinitionsBuilder(G_FPTRUNC)
+    .legalFor({{S32, S64}});
+
+  // Use actual fsub instruction
+  setAction({G_FSUB, S32}, Legal);
+
+  // Must use fadd + fneg
+  setAction({G_FSUB, S64}, Lower);
 
   setAction({G_FCMP, S1}, Legal);
   setAction({G_FCMP, 1, S32}, Legal);
   setAction({G_FCMP, 1, S64}, Legal);
 
-  setAction({G_FMUL, S32}, Legal);
-
   setAction({G_ZEXT, S64}, Legal);
   setAction({G_ZEXT, 1, S32}, Legal);
 
+  setAction({G_SEXT, S64}, Legal);
+  setAction({G_SEXT, 1, S32}, Legal);
+
+  setAction({G_ANYEXT, S64}, Legal);
+  setAction({G_ANYEXT, 1, S32}, Legal);
+
   setAction({G_FPTOSI, S32}, Legal);
   setAction({G_FPTOSI, 1, S32}, Legal);
 
   setAction({G_SITOFP, S32}, Legal);
   setAction({G_SITOFP, 1, S32}, Legal);
 
+  setAction({G_UITOFP, S32}, Legal);
+  setAction({G_UITOFP, 1, S32}, Legal);
+
   setAction({G_FPTOUI, S32}, Legal);
   setAction({G_FPTOUI, 1, S32}, Legal);
 
+  setAction({G_FPOW, S32}, Legal);
+  setAction({G_FEXP2, S32}, Legal);
+  setAction({G_FLOG2, S32}, Legal);
+
+  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND})
+    .legalFor({S32, S64});
+
   for (LLT PtrTy : AddrSpaces) {
     LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits());
     setAction({G_GEP, PtrTy}, Legal);
     setAction({G_GEP, 1, IdxTy}, Legal);
   }
 
+  setAction({G_BLOCK_ADDR, CodePtr}, Legal);
+
   setAction({G_ICMP, S1}, Legal);
   setAction({G_ICMP, 1, S32}, Legal);
 
+  setAction({G_CTLZ, S32}, Legal);
+  setAction({G_CTLZ_ZERO_UNDEF, S32}, Legal);
+  setAction({G_CTTZ, S32}, Legal);
+  setAction({G_CTTZ_ZERO_UNDEF, S32}, Legal);
+  setAction({G_BSWAP, S32}, Legal);
+  setAction({G_CTPOP, S32}, Legal);
+
+  getActionDefinitionsBuilder(G_INTTOPTR)
+    .legalIf([](const LegalityQuery &Query) {
+      return true;
+    });
+
+  getActionDefinitionsBuilder(G_PTRTOINT)
+    .legalIf([](const LegalityQuery &Query) {
+      return true;
+    });
 
   getActionDefinitionsBuilder({G_LOAD, G_STORE})
     .legalIf([=, &ST](const LegalityQuery &Query) {
@@ -145,6 +229,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
       });
 
 
+  auto &Atomics = getActionDefinitionsBuilder(
+    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
+     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
+     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
+     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
+    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
+               {S64, GlobalPtr}, {S64, LocalPtr}});
+  if (ST.hasFlatAddressSpace()) {
+    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
+  }
 
   setAction({G_SELECT, S32}, Legal);
   setAction({G_SELECT, 1, S1}, Legal);
@@ -180,6 +274,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
                (Ty1.getSizeInBits() % 32 == 0);
       });
 
+  getActionDefinitionsBuilder(G_BUILD_VECTOR)
+    .legalForCartesianProduct(AllS32Vectors, {S32})
+    .legalForCartesianProduct(AllS64Vectors, {S64})
+    .clampNumElements(0, V16S32, V16S32)
+    .clampNumElements(0, V2S64, V8S64)
+    .minScalarSameAs(1, 0);
+
+  // TODO: Support any combination of v2s32
+  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
+    .legalFor({{V4S32, V2S32},
+               {V8S32, V2S32},
+               {V8S32, V4S32},
+               {V4S64, V2S64},
+               {V4S16, V2S16},
+               {V8S16, V2S16},
+               {V8S16, V4S16}});
+
   // Merge/Unmerge
   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 7a7ed7a4f0656..14e8800426911 100644
--- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1333,8 +1333,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   // for OpenCL 2.0 we have only generic implementation of sincos
   // function.
   AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
-  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
-  nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS.FLAT_ADDRESS);
+  nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
   Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
   if (!Fsincos) return false;
 
@@ -1347,7 +1346,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   // The allocaInst allocates the memory in private address space. This need
   // to be bitcasted to point to the address space of cos pointer type.
   // In OpenCL 2.0 this is generic, while in 1.2 that is private.
-  if (PTy->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
+  if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
     P = B.CreateAddrSpaceCast(Alloc, PTy);
   CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
 
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 4671273d61f91..4fc3fe0f105b0 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -90,7 +90,6 @@ class UnmangledFuncInfo {
 
 public:
   using ID = AMDGPULibFunc::EFuncId;
-  UnmangledFuncInfo() = default;
   UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs)
       : Name(_Name), NumArgs(_NumArgs) {}
   // Get index to Table by function name.
@@ -996,8 +995,10 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
   } else {
     AttributeList Attr;
     LLVMContext &Ctx = M->getContext();
-    Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::ReadOnly);
-    Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::NoUnwind);
+    Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
+                             Attribute::ReadOnly);
+    Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
+                             Attribute::NoUnwind);
     C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
   }
 
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index c147830e12ed6..743dc7a0d00b9 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -16,7 +16,6 @@
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -84,8 +83,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
     return false;
 
   CallInst *KernArgSegment =
-    Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
-                            F.getName() + ".kernarg.segment");
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
+                              nullptr, F.getName() + ".kernarg.segment");
 
   KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
   KernArgSegment->addAttribute(AttributeList::ReturnIndex,
@@ -123,14 +122,17 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
 
     VectorType *VT = dyn_cast<VectorType>(ArgTy);
     bool IsV3 = VT && VT->getNumElements() == 3;
+    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
+
     VectorType *V4Ty = nullptr;
 
     int64_t AlignDownOffset = alignDown(EltOffset, 4);
     int64_t OffsetDiff = EltOffset - AlignDownOffset;
-    unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
+    unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
+                                      KernArgBaseAlign);
 
     Value *ArgPtr;
-    if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
+    if (DoShiftOpt) { // FIXME: Handle aggregate types
       // Since we don't have sub-dword scalar loads, avoid doing an extload by
       // loading earlier than the argument address, and extracting the relevant
       // bits.
@@ -148,7 +150,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
     } else {
       ArgPtr = Builder.CreateConstInBoundsGEP1_64(
         KernArgSegment,
-        AlignDownOffset,
+        EltOffset,
         Arg.getName() + ".kernarg.offset");
       ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
                                      ArgPtr->getName() + ".cast");
@@ -199,7 +201,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
 
     // TODO: Convert noalias arg to !noalias
 
-    if (Size < 32 && !ArgTy->isAggregateType()) {
+    if (DoShiftOpt) {
       Value *ExtractBits = OffsetDiff == 0 ?
         Load : Builder.CreateLShr(Load, OffsetDiff * 8);
 
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 1876dc3f71221..f6bdbf5e9be2c 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -301,6 +301,26 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     MCInstLowering.lower(MI, TmpInst);
     EmitToStreamer(*OutStreamer, TmpInst);
 
+#ifdef EXPENSIVE_CHECKS
+    // Sanity-check getInstSizeInBytes on explicitly specified CPUs (it cannot
+    // work correctly for the generic CPU).
+    //
+    // The isPseudo check really shouldn't be here, but unfortunately there are
+    // some negative lit tests that depend on being able to continue through
+    // here even when pseudo instructions haven't been lowered.
+    if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU())) {
+      SmallVector<MCFixup, 4> Fixups;
+      SmallVector<char, 16> CodeBytes;
+      raw_svector_ostream CodeStream(CodeBytes);
+
+      std::unique_ptr<MCCodeEmitter> InstEmitter(createSIMCCodeEmitter(
+          *STI.getInstrInfo(), *OutContext.getRegisterInfo(), OutContext));
+      InstEmitter->encodeInstruction(TmpInst, CodeStream, Fixups, STI);
+
+      assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
+    }
+#endif
+
     if (STI.dumpCode()) {
       // Disassemble instruction/operands to text.
       DisasmLines.resize(DisasmLines.size() + 1);
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 995d9ae3907fc..5e0b7d4290220 100644
--- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -42,9 +42,12 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
     if (!FirstMI)
       return true;
 
+    const MachineBasicBlock &MBB = *FirstMI->getParent();
+    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
     const MachineOperand *Src2 = TII.getNamedOperand(SecondMI,
                                                      AMDGPU::OpName::src2);
-    return FirstMI->definesRegister(Src2->getReg());
+    return FirstMI->definesRegister(Src2->getReg(), TRI);
   }
   default:
     return false;
diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h
index b50a2eb8e9e71..2feff14d34a15 100644
--- a/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -23,7 +23,8 @@ namespace ElfNote {
 
 const char SectionName[] = ".note";
 
-const char NoteName[] = "AMD";
+const char NoteNameV2[] = "AMD";
+const char NoteNameV3[] = "AMDGPU";
 
 // TODO: Remove this file once we drop code object v2.
 enum NoteType{
diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 3cfdccc9fe51a..e53a8fe7c074d 100644
--- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -99,8 +99,6 @@ private:
 
   const DataLayout *DL;
 
-  AMDGPUAS AS;
-
   const TargetLowering *TLI;
 
   void visit(const Function &F);
@@ -267,7 +265,6 @@ void AMDGPUPerfHint::runOnFunction(Function &F) {
 
   const Module &M = *F.getParent();
   DL = &M.getDataLayout();
-  AS = AMDGPU::getAMDGPUAS(M);
 
   visit(F);
   auto Loc = FIM.find(&F);
@@ -306,14 +303,14 @@ bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
   if (auto PT = dyn_cast<PointerType>(V->getType())) {
     unsigned As = PT->getAddressSpace();
     // Flat likely points to global too.
-    return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS;
+    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
   }
   return false;
 }
 
 bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
   if (auto PT = dyn_cast<PointerType>(V->getType()))
-    return PT->getAddressSpace() == AS.LOCAL_ADDRESS;
+    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
   return false;
 }
 
@@ -346,7 +343,8 @@ AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
 bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
   if (auto PT = dyn_cast<PointerType>(V->getType())) {
     unsigned As = PT->getAddressSpace();
-    return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT;
+    return As == AMDGPUAS::CONSTANT_ADDRESS ||
+           As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
   }
   return false;
 }
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index d341fec6296fb..5d087c0991844 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -70,13 +70,17 @@ static cl::opt<bool> DisablePromoteAllocaToVector(
   cl::desc("Disable promote alloca to vector"),
   cl::init(false));
 
+static cl::opt<bool> DisablePromoteAllocaToLDS(
+  "disable-promote-alloca-to-lds",
+  cl::desc("Disable promote alloca to LDS"),
+  cl::init(false));
+
 // FIXME: This can create globals so should be a module pass.
 class AMDGPUPromoteAlloca : public FunctionPass {
 private:
   const TargetMachine *TM;
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
-  AMDGPUAS AS;
 
   // FIXME: This should be per-kernel.
   uint32_t LocalMemLimit = 0;
@@ -156,8 +160,6 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   if (!ST.isPromoteAllocaEnabled())
     return false;
 
-  AS = AMDGPU::getAMDGPUAS(*F.getParent());
-
   bool SufficientLDS = hasSufficientLocalMem(F);
   bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
@@ -238,7 +240,7 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
 
   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
   Value *CastDispatchPtr = Builder.CreateBitCast(
-    DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS));
+    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
 
   // We could do a single 64-bit load here, but it's likely that the basic
   // 32-bit and extract sequence is already present, and it is probably easier
@@ -326,6 +328,10 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
     // Currently only handle the case where the Pointer Operand is a GEP.
     // Also we could not vectorize volatile or atomic loads.
     LoadInst *LI = cast<LoadInst>(Inst);
+    if (isa<AllocaInst>(User) &&
+        LI->getPointerOperandType() == User->getType() &&
+        isa<VectorType>(LI->getType()))
+      return true;
     return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
   }
   case Instruction::BitCast:
@@ -335,6 +341,10 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
     // since it should be canonical form, the User should be a GEP.
     // Also we could not vectorize volatile or atomic stores.
     StoreInst *SI = cast<StoreInst>(Inst);
+    if (isa<AllocaInst>(User) &&
+        SI->getPointerOperandType() == User->getType() &&
+        isa<VectorType>(SI->getValueOperand()->getType()))
+      return true;
     return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
   }
   default:
@@ -342,14 +352,15 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
   }
 }
 
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
 
   if (DisablePromoteAllocaToVector) {
     LLVM_DEBUG(dbgs() << "  Promotion alloca to vector is disabled\n");
     return false;
   }
 
-  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
+  Type *AT = Alloca->getAllocatedType();
+  SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
 
   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 
@@ -396,7 +407,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
     }
   }
 
-  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+  VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
+  if (!VectorTy)
+    VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
 
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
@@ -406,7 +419,10 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
+      if (Inst->getType() == AT)
+        break;
+
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 
@@ -418,9 +434,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
       break;
     }
     case Instruction::Store: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
-
       StoreInst *SI = cast<StoreInst>(Inst);
+      if (SI->getValueOperand()->getType() == AT)
+        break;
+
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = SI->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
@@ -610,7 +628,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
   // we cannot use local memory in the pass.
   for (Type *ParamTy : FTy->params()) {
     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
-    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
+    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
       LocalMemLimit = 0;
       LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                            "local memory disabled.\n");
@@ -627,7 +645,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
   // Check how much local memory is being used by global objects
   CurrentLocalMemUsage = 0;
   for (GlobalVariable &GV : Mod->globals()) {
-    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
+    if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
       continue;
 
     for (const User *U : GV.users()) {
@@ -706,9 +724,12 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
 
   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I, AS))
+  if (tryPromoteAllocaToVector(&I))
     return true; // Promoted to vector.
 
+  if (DisablePromoteAllocaToLDS)
+    return false;
+
   const Function &ContainingFunction = *I.getParent()->getParent();
   CallingConv::ID CC = ContainingFunction.getCallingConv();
 
@@ -775,7 +796,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
       Twine(F->getName()) + Twine('.') + I.getName(),
       nullptr,
       GlobalVariable::NotThreadLocal,
-      AS.LOCAL_ADDRESS);
+      AMDGPUAS::LOCAL_ADDRESS);
   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
   GV->setAlignment(I.getAlignment());
 
@@ -808,7 +829,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
         Value *Src0 = CI->getOperand(0);
         Type *EltTy = Src0->getType()->getPointerElementType();
-        PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
+        PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
 
         if (isa<ConstantPointerNull>(CI->getOperand(0)))
           CI->setOperand(0, ConstantPointerNull::get(NewTy));
@@ -825,7 +846,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
         continue;
 
       Type *EltTy = V->getType()->getPointerElementType();
-      PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
+      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
 
       // FIXME: It doesn't really make sense to try to do this for all
       // instructions.
@@ -894,7 +915,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
       Type *SrcTy = Src->getType()->getPointerElementType();
       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
         Intrinsic::objectsize,
-        { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) }
+        { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
       );
 
       CallInst *NewCall = Builder.CreateCall(
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 012e4fe200aae..7a760dcf7a908 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -35,7 +35,7 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
     : AMDGPUGenRegisterBankInfo(),
       TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
 
-  // HACK: Until this is fully tablegen'd
+  // HACK: Until this is fully tablegen'd.
   static bool AlreadyInit = false;
   if (AlreadyInit)
     return;
@@ -74,13 +74,16 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                           const RegisterBank &Src,
                                           unsigned Size) const {
   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
-      Src.getID() == AMDGPU::VGPRRegBankID)
+      Src.getID() == AMDGPU::VGPRRegBankID) {
     return std::numeric_limits<unsigned>::max();
+  }
 
   // SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by
   // the valu.
   if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID &&
-      Src.getID() == AMDGPU::SGPRRegBankID)
+      (Src.getID() == AMDGPU::SGPRRegBankID ||
+       Src.getID() == AMDGPU::VGPRRegBankID ||
+       Src.getID() == AMDGPU::VCCRegBankID))
     return std::numeric_limits<unsigned>::max();
 
   return RegisterBankInfo::copyCost(Dst, Src, Size);
@@ -145,7 +148,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
     AltMappings.push_back(&SSMapping);
 
     const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
-      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                           nullptr, // Predicate operand.
                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
@@ -153,7 +156,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
     AltMappings.push_back(&SVMapping);
 
     const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
-      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                           nullptr, // Predicate operand.
                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
@@ -161,7 +164,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
     AltMappings.push_back(&VSMapping);
 
     const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
-      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                           nullptr, // Predicate operand.
                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
@@ -170,6 +173,67 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
 
     return AltMappings;
   }
+  case TargetOpcode::G_SELECT: {
+    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
+      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+                          AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
+                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+      4); // Num Operands
+    AltMappings.push_back(&SSMapping);
+
+    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
+      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
+                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
+      4); // Num Operands
+    AltMappings.push_back(&VVMapping);
+
+    return AltMappings;
+  }
+  case TargetOpcode::G_UADDE:
+  case TargetOpcode::G_USUBE:
+  case TargetOpcode::G_SADDE:
+  case TargetOpcode::G_SSUBE: {
+    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
+      getOperandsMapping(
+        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
+         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
+      5); // Num Operands
+    AltMappings.push_back(&SSMapping);
+
+    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
+      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
+                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
+      5); // Num Operands
+    AltMappings.push_back(&VVMapping);
+    return AltMappings;
+  }
+  case AMDGPU::G_BRCOND: {
+    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
+
+    const InstructionMapping &SMapping = getInstructionMapping(
+      1, 1, getOperandsMapping(
+        {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
+      2); // Num Operands
+    AltMappings.push_back(&SMapping);
+
+    const InstructionMapping &VMapping = getInstructionMapping(
+      1, 1, getOperandsMapping(
+        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
+      2); // Num Operands
+    AltMappings.push_back(&VMapping);
+    return AltMappings;
+  }
   default:
     break;
   }
@@ -193,10 +257,16 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
   const MachineFunction &MF = *MI.getParent()->getParent();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
+    if (!MI.getOperand(i).isReg())
+      continue;
     unsigned Reg = MI.getOperand(i).getReg();
-    const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
-    if (Bank && Bank->getID() != AMDGPU::SGPRRegBankID)
-      return false;
+    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
+      if (Bank->getID() == AMDGPU::VGPRRegBankID)
+        return false;
+
+      assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
+             Bank->getID() == AMDGPU::SCCRegBankID);
+    }
   }
   return true;
 }
@@ -209,7 +279,8 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
 
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
-    OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+    unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
+    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
   }
   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                                MI.getNumOperands());
@@ -230,12 +301,32 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
 
   unsigned Reg1 = MI.getOperand(OpdIdx).getReg();
   unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
-  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI);
+
+  unsigned DefaultBankID = Size1 == 1 ?
+    AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
+  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
+
   OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
 
   for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
     unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI);
-    OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
+    OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
+  }
+
+  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+                               MI.getNumOperands());
+}
+
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+
+  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+    unsigned Size = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
+    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   }
 
   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
@@ -304,21 +395,49 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   default:
     return getInvalidInstructionMapping();
+
+  case AMDGPU::G_AND:
+  case AMDGPU::G_OR:
+  case AMDGPU::G_XOR: {
+    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    if (Size == 1) {
+      OpdsMapping[0] = OpdsMapping[1] =
+        OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+      break;
+    }
+
+    LLVM_FALLTHROUGH;
+  }
+
   case AMDGPU::G_ADD:
   case AMDGPU::G_SUB:
   case AMDGPU::G_MUL:
-  case AMDGPU::G_AND:
-  case AMDGPU::G_OR:
-  case AMDGPU::G_XOR:
   case AMDGPU::G_SHL:
+  case AMDGPU::G_UADDO:
+  case AMDGPU::G_SADDO:
+  case AMDGPU::G_USUBO:
+  case AMDGPU::G_SSUBO:
+  case AMDGPU::G_UADDE:
+  case AMDGPU::G_SADDE:
+  case AMDGPU::G_USUBE:
+  case AMDGPU::G_SSUBE:
     if (isSALUMapping(MI))
       return getDefaultMappingSOP(MI);
-    // Fall-through
+    LLVM_FALLTHROUGH;
 
   case AMDGPU::G_FADD:
+  case AMDGPU::G_FSUB:
   case AMDGPU::G_FPTOSI:
   case AMDGPU::G_FPTOUI:
   case AMDGPU::G_FMUL:
+  case AMDGPU::G_FMA:
+  case AMDGPU::G_SITOFP:
+  case AMDGPU::G_UITOFP:
+  case AMDGPU::G_FPTRUNC:
+  case AMDGPU::G_FEXP2:
+  case AMDGPU::G_FLOG2:
+  case AMDGPU::G_INTRINSIC_TRUNC:
+  case AMDGPU::G_INTRINSIC_ROUND:
     return getDefaultMappingVOP(MI);
   case AMDGPU::G_IMPLICIT_DEF: {
     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -326,11 +445,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     break;
   }
   case AMDGPU::G_FCONSTANT:
-  case AMDGPU::G_CONSTANT: {
+  case AMDGPU::G_CONSTANT:
+  case AMDGPU::G_FRAME_INDEX:
+  case AMDGPU::G_BLOCK_ADDR: {
     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
     break;
   }
+  case AMDGPU::G_INSERT: {
+    unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
+                                          AMDGPU::VGPRRegBankID;
+    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
+    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
+    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
+    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
+    OpdsMapping[3] = nullptr;
+    break;
+  }
   case AMDGPU::G_EXTRACT: {
     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -352,7 +485,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
     break;
   }
-  case AMDGPU::G_BITCAST: {
+  case AMDGPU::G_BITCAST:
+  case AMDGPU::G_INTTOPTR:
+  case AMDGPU::G_PTRTOINT:
+  case AMDGPU::G_CTLZ:
+  case AMDGPU::G_CTLZ_ZERO_UNDEF:
+  case AMDGPU::G_CTTZ:
+  case AMDGPU::G_CTTZ_ZERO_UNDEF:
+  case AMDGPU::G_CTPOP:
+  case AMDGPU::G_BSWAP:
+  case AMDGPU::G_FABS:
+  case AMDGPU::G_FNEG: {
     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
@@ -368,7 +511,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
     break;
   }
-  case AMDGPU::G_ZEXT: {
+  case AMDGPU::G_ZEXT:
+  case AMDGPU::G_SEXT:
+  case AMDGPU::G_ANYEXT: {
     unsigned Dst = MI.getOperand(0).getReg();
     unsigned Src = MI.getOperand(1).getReg();
     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
@@ -391,7 +536,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_FCMP: {
     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
-    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 1);
+    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
     OpdsMapping[1] = nullptr; // Predicate Operand.
     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
@@ -431,7 +576,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
     unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID &&
                        Op3Bank == AMDGPU::SGPRRegBankID ?
-                       AMDGPU::SCCRegBankID : AMDGPU::VGPRRegBankID;
+                       AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
     OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
     OpdsMapping[1] = nullptr; // Predicate Operand.
     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
@@ -479,6 +624,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
 
     break;
   }
+  case AMDGPU::G_UNMERGE_VALUES: {
+    unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
+      AMDGPU::VGPRRegBankID;
+
+    // Op1 and Dst should use the same register bank.
+    // FIXME: Shouldn't this be the default? Why do we need to handle this?
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
+    }
+    break;
+  }
   case AMDGPU::G_INTRINSIC: {
     switch (MI.getOperand(1).getIntrinsicID()) {
     default:
@@ -492,6 +649,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
       break;
     }
+    case Intrinsic::amdgcn_wqm_vote: {
+      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+      OpdsMapping[0] = OpdsMapping[2]
+        = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+      break;
+    }
     }
     break;
   }
@@ -528,8 +691,50 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     }
     break;
   }
+  case AMDGPU::G_SELECT: {
+    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    unsigned Op1Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+                                    AMDGPU::SGPRRegBankID);
+    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+    bool SGPRSrcs = Op1Bank == AMDGPU::SCCRegBankID &&
+                    Op2Bank == AMDGPU::SGPRRegBankID &&
+                    Op3Bank == AMDGPU::SGPRRegBankID;
+    unsigned Bank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+    Op1Bank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
+    OpdsMapping[1] = AMDGPU::getValueMapping(Op1Bank, 1);
+    OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
+    OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
+    break;
+  }
+
   case AMDGPU::G_LOAD:
     return getInstrMappingForLoad(MI);
+
+  case AMDGPU::G_ATOMICRMW_XCHG:
+  case AMDGPU::G_ATOMICRMW_ADD:
+  case AMDGPU::G_ATOMICRMW_SUB:
+  case AMDGPU::G_ATOMICRMW_AND:
+  case AMDGPU::G_ATOMICRMW_OR:
+  case AMDGPU::G_ATOMICRMW_XOR:
+  case AMDGPU::G_ATOMICRMW_MAX:
+  case AMDGPU::G_ATOMICRMW_MIN:
+  case AMDGPU::G_ATOMICRMW_UMAX:
+  case AMDGPU::G_ATOMICRMW_UMIN:
+  case AMDGPU::G_ATOMIC_CMPXCHG: {
+    return getDefaultMappingAllVGPR(MI);
+  }
+  case AMDGPU::G_BRCOND: {
+    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
+                                 AMDGPU::SGPRRegBankID);
+    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
+    if (Bank != AMDGPU::SCCRegBankID)
+      Bank = AMDGPU::VCCRegBankID;
+
+    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
+    break;
+  }
   }
 
   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index d48a665898735..d29f4bc79a519 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -49,6 +49,8 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
   bool isSALUMapping(const MachineInstr &MI) const;
   const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
   const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
+  const InstructionMapping &getDefaultMappingAllVGPR(
+    const MachineInstr &MI) const;
 public:
   AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI);
 
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 7f7f75f656479..570379a820e12 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -15,4 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR",
   [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
 >;
 
-def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS ]>;
+def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS]>;
+
+// It is helpful to distinguish conditions from ordinary SGPRs.
+def VCCRegBank : RegisterBank <"VCC", [SReg_64]>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 07de5fc549e29..922d974f2ebd6 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -27,8 +27,6 @@ class TargetInstrInfo;
 struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
   AMDGPURegisterInfo();
 
-  bool enableMultipleCopyHints() const override { return true; }
-
   /// \returns the sub reg enum value for the given \p Channel
   /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
   static unsigned getSubRegFromChannel(unsigned Channel);
diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index a861762a8c9e3..efe501cb73c27 100644
--- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -163,7 +163,7 @@ bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const {
       // some casts between structs and non-structs, but we can't bitcast
       // directly between them.  directly bitcast between them.  Blender uses
       // some casts that look like { <3 x float> }* to <4 x float>*
-      if ((SrcEltTy->isStructTy() && (SrcEltTy->getNumContainedTypes() != 1)))
+      if ((SrcEltTy->isStructTy() && (SrcEltTy->getStructNumElements() != 1)))
         return false;
 
       // Clang emits OpenCL 3-vector type accesses with a bitcast to the
@@ -401,8 +401,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
       if (Val->getType() != EltTy) {
         Type *EffectiveEltTy = EltTy;
         if (StructType *CT = dyn_cast<StructType>(EltTy)) {
-          assert(CT->getNumContainedTypes() == 1);
-          EffectiveEltTy = CT->getContainedType(0);
+          assert(CT->getNumElements() == 1);
+          EffectiveEltTy = CT->getElementType(0);
         }
 
         if (DL->getTypeSizeInBits(EffectiveEltTy) !=
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 98b49070fa99f..ed0cc70c3d9aa 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -74,6 +74,9 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
   // We want to be able to turn these off, but making this a subtarget feature
   // for SI has the unhelpful behavior that it unsets everything else if you
   // disable it.
+  //
+  // Similarly we want enable-prt-strict-null to be on by default and not to
+  // unset everything else if it is disabled
 
   SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
 
@@ -89,6 +92,8 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
     FullFS += "-fp32-denormals,";
   }
 
+  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
+
   FullFS += FS;
 
   ParseSubtargetFeatures(GPU, FullFS);
@@ -124,10 +129,8 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
   return *this;
 }
 
-AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
-                                             const FeatureBitset &FeatureBits) :
+AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
   TargetTriple(TT),
-  SubtargetFeatureBits(FeatureBits),
   Has16BitInsts(false),
   HasMadMixInsts(false),
   FP32Denormals(false),
@@ -136,19 +139,22 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
   HasVOP3PInsts(false),
   HasMulI24(true),
   HasMulU24(true),
+  HasInv2PiInlineImm(false),
   HasFminFmaxLegacy(true),
   EnablePromoteAlloca(false),
+  HasTrigReducedRange(false),
   LocalMemorySize(0),
   WavefrontSize(0)
   { }
 
 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
-                                 const GCNTargetMachine &TM) :
+                           const GCNTargetMachine &TM) :
     AMDGPUGenSubtargetInfo(TT, GPU, FS),
-    AMDGPUSubtarget(TT, getFeatureBits()),
+    AMDGPUSubtarget(TT),
     TargetTriple(TT),
     Gen(SOUTHERN_ISLANDS),
     IsaVersion(ISAVersion0_0_0),
+    InstrItins(getInstrItineraryForCPU(GPU)),
     LDSBankCount(0),
     MaxPrivateElementSize(0),
 
@@ -170,16 +176,17 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     DebuggerEmitPrologue(false),
 
     EnableHugePrivateBuffer(false),
-    EnableVGPRSpilling(false),
     EnableLoadStoreOpt(false),
     EnableUnsafeDSOffsetFolding(false),
     EnableSIScheduler(false),
     EnableDS128(false),
+    EnablePRTStrictNull(false),
     DumpCode(false),
 
     FP64(false),
     GCN3Encoding(false),
     CIInsts(false),
+    VIInsts(false),
     GFX9Insts(false),
     SGPRInitBug(false),
     HasSMemRealTime(false),
@@ -189,15 +196,16 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasVGPRIndexMode(false),
     HasScalarStores(false),
     HasScalarAtomics(false),
-    HasInv2PiInlineImm(false),
     HasSDWAOmod(false),
     HasSDWAScalar(false),
     HasSDWASdst(false),
     HasSDWAMac(false),
     HasSDWAOutModsVOPC(false),
     HasDPP(false),
+    HasR128A16(false),
     HasDLInsts(false),
-    D16PreservesUnusedBits(false),
+    HasDotInsts(false),
+    EnableSRAMECC(false),
     FlatAddressSpace(false),
     FlatInstOffsets(false),
     FlatGlobalInsts(false),
@@ -211,7 +219,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
     TLInfo(TM, *this),
     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
-  AS = AMDGPU::getAMDGPUAS(TT);
   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
   RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
@@ -447,7 +454,7 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                              const TargetMachine &TM) :
   R600GenSubtargetInfo(TT, GPU, FS),
-  AMDGPUSubtarget(TT, getFeatureBits()),
+  AMDGPUSubtarget(TT),
   InstrInfo(*this),
   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
   FMA(false),
@@ -460,8 +467,7 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
   TexVTXClauseSize(0),
   Gen(R600),
   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
-  InstrItins(getInstrItineraryForCPU(GPU)),
-  AS (AMDGPU::getAMDGPUAS(TT)) { }
+  InstrItins(getInstrItineraryForCPU(GPU)) { }
 
 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
@@ -480,10 +486,6 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
     Policy.ShouldTrackLaneMasks = true;
 }
 
-bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
-  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
-}
-
 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
     if (SGPRs <= 80)
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 6231097336518..5584759e55804 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -63,7 +63,6 @@ private:
   Triple TargetTriple;
 
 protected:
-  const FeatureBitset &SubtargetFeatureBits;
   bool Has16BitInsts;
   bool HasMadMixInsts;
   bool FP32Denormals;
@@ -72,13 +71,15 @@ protected:
   bool HasVOP3PInsts;
   bool HasMulI24;
   bool HasMulU24;
+  bool HasInv2PiInlineImm;
   bool HasFminFmaxLegacy;
   bool EnablePromoteAlloca;
+  bool HasTrigReducedRange;
   int LocalMemorySize;
   unsigned WavefrontSize;
 
 public:
-  AMDGPUSubtarget(const Triple &TT, const FeatureBitset &FeatureBits);
+  AMDGPUSubtarget(const Triple &TT);
 
   static const AMDGPUSubtarget &get(const MachineFunction &MF);
   static const AMDGPUSubtarget &get(const TargetMachine &TM,
@@ -134,7 +135,7 @@ public:
     return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
   }
 
-  bool isAmdCodeObjectV2(const Function &F) const {
+  bool isAmdHsaOrMesa(const Function &F) const {
     return isAmdHsaOS() || isMesaKernel(F);
   }
 
@@ -170,10 +171,18 @@ public:
     return HasMulU24;
   }
 
+  bool hasInv2PiInlineImm() const {
+    return HasInv2PiInlineImm;
+  }
+
   bool hasFminFmaxLegacy() const {
     return HasFminFmaxLegacy;
   }
 
+  bool hasTrigReducedRange() const {
+    return HasTrigReducedRange;
+  }
+
   bool isPromoteAllocaEnabled() const {
     return EnablePromoteAlloca;
   }
@@ -193,38 +202,26 @@ public:
   /// Returns the offset in bytes from the start of the input buffer
   ///        of the first explicit kernel argument.
   unsigned getExplicitKernelArgOffset(const Function &F) const {
-    return isAmdCodeObjectV2(F) ? 0 : 36;
+    return isAmdHsaOrMesa(F) ? 0 : 36;
   }
 
   /// \returns Maximum number of work groups per compute unit supported by the
   /// subtarget and limited by given \p FlatWorkGroupSize.
-  unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
-    return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(SubtargetFeatureBits,
-                                                  FlatWorkGroupSize);
-  }
+  virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
 
   /// \returns Minimum flat work group size supported by the subtarget.
-  unsigned getMinFlatWorkGroupSize() const {
-    return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(SubtargetFeatureBits);
-  }
+  virtual unsigned getMinFlatWorkGroupSize() const = 0;
 
   /// \returns Maximum flat work group size supported by the subtarget.
-  unsigned getMaxFlatWorkGroupSize() const {
-    return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(SubtargetFeatureBits);
-  }
+  virtual unsigned getMaxFlatWorkGroupSize() const = 0;
 
   /// \returns Maximum number of waves per execution unit supported by the
   /// subtarget and limited by given \p FlatWorkGroupSize.
-  unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
-    return AMDGPU::IsaInfo::getMaxWavesPerEU(SubtargetFeatureBits,
-                                             FlatWorkGroupSize);
-  }
+  virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const  = 0;
 
   /// \returns Minimum number of waves per execution unit supported by the
   /// subtarget.
-  unsigned getMinWavesPerEU() const {
-    return AMDGPU::IsaInfo::getMinWavesPerEU(SubtargetFeatureBits);
-  }
+  virtual unsigned getMinWavesPerEU() const = 0;
 
   unsigned getMaxWavesPerEU() const { return 10; }
 
@@ -266,6 +263,7 @@ public:
     ISAVersion9_0_2,
     ISAVersion9_0_4,
     ISAVersion9_0_6,
+    ISAVersion9_0_9,
   };
 
   enum TrapHandlerAbi {
@@ -300,6 +298,7 @@ protected:
   Triple TargetTriple;
   unsigned Gen;
   unsigned IsaVersion;
+  InstrItineraryData InstrItins;
   int LDSBankCount;
   unsigned MaxPrivateElementSize;
 
@@ -323,11 +322,11 @@ protected:
 
   // Used as options.
   bool EnableHugePrivateBuffer;
-  bool EnableVGPRSpilling;
   bool EnableLoadStoreOpt;
   bool EnableUnsafeDSOffsetFolding;
   bool EnableSIScheduler;
   bool EnableDS128;
+  bool EnablePRTStrictNull;
   bool DumpCode;
 
   // Subtarget statically properties set by tablegen
@@ -337,6 +336,7 @@ protected:
   bool IsGCN;
   bool GCN3Encoding;
   bool CIInsts;
+  bool VIInsts;
   bool GFX9Insts;
   bool SGPRInitBug;
   bool HasSMemRealTime;
@@ -346,15 +346,16 @@ protected:
   bool HasVGPRIndexMode;
   bool HasScalarStores;
   bool HasScalarAtomics;
-  bool HasInv2PiInlineImm;
   bool HasSDWAOmod;
   bool HasSDWAScalar;
   bool HasSDWASdst;
   bool HasSDWAMac;
   bool HasSDWAOutModsVOPC;
   bool HasDPP;
+  bool HasR128A16;
   bool HasDLInsts;
-  bool D16PreservesUnusedBits;
+  bool HasDotInsts;
+  bool EnableSRAMECC;
   bool FlatAddressSpace;
   bool FlatInstOffsets;
   bool FlatGlobalInsts;
@@ -372,7 +373,6 @@ protected:
   bool FeatureDisable;
 
   SelectionDAGTargetInfo TSInfo;
-  AMDGPUAS AS;
 private:
   SIInstrInfo InstrInfo;
   SITargetLowering TLInfo;
@@ -423,6 +423,10 @@ public:
     return &TSInfo;
   }
 
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
   Generation getGeneration() const {
@@ -441,10 +445,6 @@ public:
     return MaxPrivateElementSize;
   }
 
-  AMDGPUAS getAMDGPUAS() const {
-    return AS;
-  }
-
   bool hasIntClamp() const {
     return HasIntClamp;
   }
@@ -517,6 +517,10 @@ public:
     return FMA;
   }
 
+  bool hasSwap() const {
+    return GFX9Insts;
+  }
+
   TrapHandlerAbi getTrapHandlerAbi() const {
     return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
   }
@@ -574,12 +578,19 @@ public:
     return getGeneration() < AMDGPUSubtarget::GFX9;
   }
 
+  /// \returns If target requires PRT Struct NULL support (zero result registers
+  /// for sparse texture support).
+  bool usePRTStrictNull() const {
+    return EnablePRTStrictNull;
+  }
+
   bool hasAutoWaitcntBeforeBarrier() const {
     return AutoWaitcntBeforeBarrier;
   }
 
   bool hasCodeObjectV3() const {
-    return CodeObjectV3;
+    // FIXME: Need to add code object v3 support for mesa and pal.
+    return isAmdHsaOS() ? CodeObjectV3 : false;
   }
 
   bool hasUnalignedBufferAccess() const {
@@ -677,8 +688,12 @@ public:
     return HasDLInsts;
   }
 
-  bool d16PreservesUnusedBits() const {
-    return D16PreservesUnusedBits;
+  bool hasDotInsts() const {
+    return HasDotInsts;
+  }
+
+  bool isSRAMECCEnabled() const {
+    return EnableSRAMECC;
   }
 
   // Scratch is allocated in 256 dword per wave blocks for the entire
@@ -707,20 +722,19 @@ public:
   /// \returns Number of execution units per compute unit supported by the
   /// subtarget.
   unsigned getEUsPerCU() const {
-    return AMDGPU::IsaInfo::getEUsPerCU(MCSubtargetInfo::getFeatureBits());
+    return AMDGPU::IsaInfo::getEUsPerCU(this);
   }
 
   /// \returns Maximum number of waves per compute unit supported by the
   /// subtarget without any kind of limitation.
   unsigned getMaxWavesPerCU() const {
-    return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits());
+    return AMDGPU::IsaInfo::getMaxWavesPerCU(this);
   }
 
   /// \returns Maximum number of waves per compute unit supported by the
   /// subtarget and limited by given \p FlatWorkGroupSize.
   unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
-    return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits(),
-                                             FlatWorkGroupSize);
+    return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
   }
 
   /// \returns Maximum number of waves per execution unit supported by the
@@ -732,8 +746,7 @@ public:
   /// \returns Number of waves per work group supported by the subtarget and
   /// limited by given \p FlatWorkGroupSize.
   unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
-    return AMDGPU::IsaInfo::getWavesPerWorkGroup(
-        MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize);
+    return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize);
   }
 
   // static wrappers
@@ -747,8 +760,6 @@ public:
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
 
-  bool isVGPRSpillingEnabled(const Function &F) const;
-
   unsigned getMaxNumUserSGPRs() const {
     return 16;
   }
@@ -781,14 +792,15 @@ public:
     return HasScalarAtomics;
   }
 
-  bool hasInv2PiInlineImm() const {
-    return HasInv2PiInlineImm;
-  }
 
   bool hasDPP() const {
     return HasDPP;
   }
 
+  bool hasR128A16() const {
+    return HasR128A16;
+  }
+
   bool enableSIScheduler() const {
     return EnableSIScheduler;
   }
@@ -817,6 +829,11 @@ public:
     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
   }
 
+  // \returns true if the subtarget supports DWORDX3 load/store instructions.
+  bool hasDwordx3LoadStores() const {
+    return CIInsts;
+  }
+
   bool hasSMovFedHazard() const {
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }
@@ -851,39 +868,34 @@ public:
 
   /// \returns SGPR allocation granularity supported by the subtarget.
   unsigned getSGPRAllocGranule() const {
-    return AMDGPU::IsaInfo::getSGPRAllocGranule(
-        MCSubtargetInfo::getFeatureBits());
+    return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
   }
 
   /// \returns SGPR encoding granularity supported by the subtarget.
   unsigned getSGPREncodingGranule() const {
-    return AMDGPU::IsaInfo::getSGPREncodingGranule(
-        MCSubtargetInfo::getFeatureBits());
+    return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
   }
 
   /// \returns Total number of SGPRs supported by the subtarget.
   unsigned getTotalNumSGPRs() const {
-    return AMDGPU::IsaInfo::getTotalNumSGPRs(MCSubtargetInfo::getFeatureBits());
+    return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
   }
 
   /// \returns Addressable number of SGPRs supported by the subtarget.
   unsigned getAddressableNumSGPRs() const {
-    return AMDGPU::IsaInfo::getAddressableNumSGPRs(
-        MCSubtargetInfo::getFeatureBits());
+    return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
   }
 
   /// \returns Minimum number of SGPRs that meets the given number of waves per
   /// execution unit requirement supported by the subtarget.
   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMinNumSGPRs(MCSubtargetInfo::getFeatureBits(),
-                                           WavesPerEU);
+    return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
   }
 
   /// \returns Maximum number of SGPRs that meets the given number of waves per
   /// execution unit requirement supported by the subtarget.
   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
-    return AMDGPU::IsaInfo::getMaxNumSGPRs(MCSubtargetInfo::getFeatureBits(),
-                                           WavesPerEU, Addressable);
+    return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
   }
 
   /// \returns Reserved number of SGPRs for given function \p MF.
@@ -901,39 +913,34 @@ public:
 
   /// \returns VGPR allocation granularity supported by the subtarget.
   unsigned getVGPRAllocGranule() const {
-    return AMDGPU::IsaInfo::getVGPRAllocGranule(
-        MCSubtargetInfo::getFeatureBits());
+    return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
   }
 
   /// \returns VGPR encoding granularity supported by the subtarget.
   unsigned getVGPREncodingGranule() const {
-    return AMDGPU::IsaInfo::getVGPREncodingGranule(
-        MCSubtargetInfo::getFeatureBits());
+    return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
   }
 
   /// \returns Total number of VGPRs supported by the subtarget.
   unsigned getTotalNumVGPRs() const {
-    return AMDGPU::IsaInfo::getTotalNumVGPRs(MCSubtargetInfo::getFeatureBits());
+    return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
   }
 
   /// \returns Addressable number of VGPRs supported by the subtarget.
   unsigned getAddressableNumVGPRs() const {
-    return AMDGPU::IsaInfo::getAddressableNumVGPRs(
-        MCSubtargetInfo::getFeatureBits());
+    return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
   }
 
   /// \returns Minimum number of VGPRs that meets given number of waves per
   /// execution unit requirement supported by the subtarget.
   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMinNumVGPRs(MCSubtargetInfo::getFeatureBits(),
-                                           WavesPerEU);
+    return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
   }
 
   /// \returns Maximum number of VGPRs that meets given number of waves per
   /// execution unit requirement supported by the subtarget.
   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMaxNumVGPRs(MCSubtargetInfo::getFeatureBits(),
-                                           WavesPerEU);
+    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
   }
 
   /// \returns Maximum number of VGPRs that meets number of waves per execution
@@ -949,6 +956,34 @@ public:
   void getPostRAMutations(
       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
       const override;
+
+  /// \returns Maximum number of work groups per compute unit supported by the
+  /// subtarget and limited by given \p FlatWorkGroupSize.
+  unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
+    return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
+  }
+
+  /// \returns Minimum flat work group size supported by the subtarget.
+  unsigned getMinFlatWorkGroupSize() const override {
+    return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
+  }
+
+  /// \returns Maximum flat work group size supported by the subtarget.
+  unsigned getMaxFlatWorkGroupSize() const override {
+    return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
+  }
+
+  /// \returns Maximum number of waves per execution unit supported by the
+  /// subtarget and limited by given \p FlatWorkGroupSize.
+  unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
+    return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
+  }
+
+  /// \returns Minimum number of waves per execution unit supported by the
+  /// subtarget.
+  unsigned getMinWavesPerEU() const override {
+    return AMDGPU::IsaInfo::getMinWavesPerEU(this);
+  }
 };
 
 class R600Subtarget final : public R600GenSubtargetInfo,
@@ -968,7 +1003,6 @@ private:
   R600TargetLowering TLInfo;
   InstrItineraryData InstrItins;
   SelectionDAGTargetInfo TSInfo;
-  AMDGPUAS AS;
 
 public:
   R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
@@ -1053,8 +1087,6 @@ public:
 
   short getTexVTXClauseSize() const { return TexVTXClauseSize; }
 
-  AMDGPUAS getAMDGPUAS() const { return AS; }
-
   bool enableMachineScheduler() const override {
     return true;
   }
@@ -1062,6 +1094,34 @@ public:
   bool enableSubRegLiveness() const override {
     return true;
   }
+
+  /// \returns Maximum number of work groups per compute unit supported by the
+  /// subtarget and limited by given \p FlatWorkGroupSize.
+  unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
+    return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
+  }
+
+  /// \returns Minimum flat work group size supported by the subtarget.
+  unsigned getMinFlatWorkGroupSize() const override {
+    return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
+  }
+
+  /// \returns Maximum flat work group size supported by the subtarget.
+  unsigned getMaxFlatWorkGroupSize() const override {
+    return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
+  }
+
+  /// \returns Maximum number of waves per execution unit supported by the
+  /// subtarget and limited by given \p FlatWorkGroupSize.
+  unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
+    return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
+  }
+
+  /// \returns Minimum number of waves per execution unit supported by the
+  /// subtarget.
+  unsigned getMinWavesPerEU() const override {
+    return AMDGPU::IsaInfo::getMinWavesPerEU(this);
+  }
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2205819c444ff..e8cefdbf74b97 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -45,6 +45,7 @@
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <memory>
 
@@ -105,6 +106,11 @@ static cl::opt<bool> EnableSDWAPeephole(
   cl::desc("Enable SDWA peepholer"),
   cl::init(true));
 
+static cl::opt<bool> EnableDPPCombine(
+  "amdgpu-dpp-combine",
+  cl::desc("Enable DPP combiner"),
+  cl::init(false));
+
 // Enable address space based alias analysis
 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
   cl::desc("Enable AMDGPU Alias Analysis"),
@@ -137,6 +143,20 @@ static cl::opt<bool> EnableLowerKernelArguments(
   cl::init(true),
   cl::Hidden);
 
+// Enable atomic optimization
+static cl::opt<bool> EnableAtomicOptimizations(
+  "amdgpu-atomic-optimizations",
+  cl::desc("Enable atomic optimizations"),
+  cl::init(false),
+  cl::Hidden);
+
+// Enable Mode register optimization
+static cl::opt<bool> EnableSIModeRegisterPass(
+  "amdgpu-mode-register",
+  cl::desc("Enable mode register pass"),
+  cl::init(true),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -150,18 +170,22 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeR600VectorRegMergerPass(*PR);
   initializeGlobalISel(*PR);
   initializeAMDGPUDAGToDAGISelPass(*PR);
+  initializeGCNDPPCombinePass(*PR);
   initializeSILowerI1CopiesPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
+  initializeSIFixupVectorISelPass(*PR);
   initializeSIFoldOperandsPass(*PR);
   initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIOptimizeExecMaskingPreRAPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
+  initializeAMDGPUFixFunctionBitcastsPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
+  initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPULowerIntrinsicsPass(*PR);
@@ -172,6 +196,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
+  initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
@@ -182,6 +207,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIFormMemoryClausesPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
+  initializeAMDGPUExternalAAWrapperPass(*PR);
   initializeAMDGPUUseNativeCallsPass(*PR);
   initializeAMDGPUSimplifyLibCallsPass(*PR);
   initializeAMDGPUInlinerPass(*PR);
@@ -292,12 +318,6 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   return Reloc::PIC_;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                          StringRef CPU, StringRef FS,
                                          TargetOptions Options,
@@ -306,9 +326,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                          CodeGenOpt::Level OptLevel)
     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                         FS, Options, getEffectiveRelocModel(RM),
-                        getEffectiveCodeModel(CM), OptLevel),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
       TLOF(createTLOF(getTargetTriple())) {
-  AS = AMDGPU::getAMDGPUAS(TT);
   initAsmInfo();
 }
 
@@ -331,13 +350,6 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
     FSAttr.getValueAsString();
 }
 
-static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
-  return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
-      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
-        AAR.addAAResult(WrapperPass->getResult());
-      });
-}
-
 /// Predicate for Internalize pass.
 static bool mustPreserveGV(const GlobalValue &GV) {
   if (const Function *F = dyn_cast<Function>(&GV))
@@ -360,17 +372,6 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
     Builder.Inliner = createAMDGPUFunctionInliningPass();
   }
 
-  if (Internalize) {
-    // If we're generating code, we always have the whole program available. The
-    // relocations expected for externally visible functions aren't supported,
-    // so make sure every non-entry function is hidden.
-    Builder.addExtension(
-      PassManagerBuilder::EP_EnabledOnOptLevel0,
-      [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
-        PM.add(createInternalizePass(mustPreserveGV));
-      });
-  }
-
   Builder.addExtension(
     PassManagerBuilder::EP_ModuleOptimizerEarly,
     [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
@@ -613,20 +614,23 @@ void AMDGPUPassConfig::addIRPasses() {
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);
 
+  addPass(createAtomicExpandPass());
+
+  // This must occur before inlining, as the inliner will not look through
+  // bitcast calls.
+  addPass(createAMDGPUFixFunctionBitcastsPass());
+
   addPass(createAMDGPULowerIntrinsicsPass());
 
-  if (TM.getTargetTriple().getArch() == Triple::r600 ||
-      !EnableAMDGPUFunctionCalls) {
-    // Function calls are not supported, so make sure we inline everything.
-    addPass(createAMDGPUAlwaysInlinePass());
-    addPass(createAlwaysInlinerLegacyPass());
-    // We need to add the barrier noop pass, otherwise adding the function
-    // inlining pass will cause all of the PassConfigs passes to be run
-    // one function at a time, which means if we have a nodule with two
-    // functions, then we will generate code for the first function
-    // without ever running any passes on the second.
-    addPass(createBarrierNoopPass());
-  }
+  // Function calls are not supported, so make sure we inline everything.
+  addPass(createAMDGPUAlwaysInlinePass());
+  addPass(createAlwaysInlinerLegacyPass());
+  // We need to add the barrier noop pass, otherwise adding the function
+  // inlining pass will cause all of the PassConfigs passes to be run
+  // one function at a time, which means if we have a nodule with two
+  // functions, then we will generate code for the first function
+  // without ever running any passes on the second.
+  addPass(createBarrierNoopPass());
 
   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
     // TODO: May want to move later or split into an early and late one.
@@ -690,6 +694,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
 }
 
 bool AMDGPUPassConfig::addPreISel() {
+  addPass(createLowerSwitchPass());
   addPass(createFlattenCFGPass());
   return false;
 }
@@ -759,6 +764,10 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
+  if (EnableAtomicOptimizations) {
+    addPass(createAMDGPUAtomicOptimizerPass());
+  }
+
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
   addPass(createAMDGPUAnnotateKernelFeaturesPass());
@@ -789,6 +798,8 @@ void GCNPassConfig::addMachineSSAOptimization() {
   //
   // XXX - Can we get away without running DeadMachineInstructionElim again?
   addPass(&SIFoldOperandsID);
+  if (EnableDPPCombine)
+    addPass(&GCNDPPCombineID);
   addPass(&DeadMachineInstructionElimID);
   addPass(&SILoadStoreOptimizerID);
   if (EnableSDWAPeephole) {
@@ -811,8 +822,10 @@ bool GCNPassConfig::addILPOpts() {
 
 bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
-  addPass(createSILowerI1CopiesPass());
   addPass(&SIFixSGPRCopiesID);
+  addPass(createSILowerI1CopiesPass());
+  addPass(createSIFixupVectorISelPass());
+  addPass(createSIAddIMGInitPass());
   return false;
 }
 
@@ -878,7 +891,8 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
 
 void GCNPassConfig::addPostRegAlloc() {
   addPass(&SIFixVGPRCopiesID);
-  addPass(&SIOptimizeExecMaskingID);
+  if (getOptLevel() > CodeGenOpt::None)
+    addPass(&SIOptimizeExecMaskingID);
   TargetPassConfig::addPostRegAlloc();
 }
 
@@ -889,6 +903,7 @@ void GCNPassConfig::addPreEmitPass() {
   addPass(createSIMemoryLegalizerPass());
   addPass(createSIInsertWaitcntsPass());
   addPass(createSIShrinkInstructionsPass());
+  addPass(createSIModeRegisterPass());
 
   // The hazard recognizer that runs as part of the post-ra scheduler does not
   // guarantee to be able handle all hazards correctly. This is because if there
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 0fe14493fabdd..62fbe71d19023 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -34,7 +34,6 @@ namespace llvm {
 class AMDGPUTargetMachine : public LLVMTargetMachine {
 protected:
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
-  AMDGPUAS AS;
 
   StringRef getGPUName(const Function &F) const;
   StringRef getFeatureString(const Function &F) const;
@@ -55,16 +54,13 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
-  AMDGPUAS getAMDGPUAS() const {
-    return AS;
-  }
 
   void adjustPassManager(PassManagerBuilder &) override;
+
   /// Get the integer value of a null pointer in the given address space.
   uint64_t getNullPointerValue(unsigned AddrSpace) const {
-    if (AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS)
-      return -1;
-    return 0;
+    return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+            AddrSpace == AMDGPUAS::REGION_ADDRESS) ? -1 : 0;
   }
 };
 
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index e2f718bd3c34d..c4e1efde130b8 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -29,3 +29,13 @@ MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
 
   return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
 }
+
+MCSection *AMDGPUTargetObjectFile::getExplicitSectionGlobal(
+    const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const {
+  // Set metadata access for the explicit section
+  StringRef SectionName = GO->getSection();
+  if (SectionName.startswith(".AMDGPU.comment."))
+    SK = SectionKind::getMetadata();
+
+  return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index dd9dc1a88fc2b..a4ae1a2c18c26 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -26,6 +26,8 @@ class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF {
   public:
     MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
                                       const TargetMachine &TM) const override;
+    MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+                                        const TargetMachine &TM) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a68b8d03f06e2..11e4ba4b5010d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -102,7 +102,6 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   unsigned ThresholdPrivate = UnrollThresholdPrivate;
   unsigned ThresholdLocal = UnrollThresholdLocal;
   unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
-  const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TargetTriple);
   for (const BasicBlock *BB : L->getBlocks()) {
     const DataLayout &DL = BB->getModule()->getDataLayout();
     unsigned LocalGEPsSeen = 0;
@@ -140,9 +139,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 
       unsigned AS = GEP->getAddressSpace();
       unsigned Threshold = 0;
-      if (AS == ASST.PRIVATE_ADDRESS)
+      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
         Threshold = ThresholdPrivate;
-      else if (AS == ASST.LOCAL_ADDRESS)
+      else if (AS == AMDGPUAS::LOCAL_ADDRESS)
         Threshold = ThresholdLocal;
       else
         continue;
@@ -150,7 +149,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
       if (UP.Threshold >= Threshold)
         continue;
 
-      if (AS == ASST.PRIVATE_ADDRESS) {
+      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
         const Value *Ptr = GEP->getPointerOperand();
         const AllocaInst *Alloca =
             dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
@@ -160,7 +159,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
         unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
         if (AllocaSize > MaxAlloca)
           continue;
-      } else if (AS == ASST.LOCAL_ADDRESS) {
+      } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
         LocalGEPsSeen++;
         // Inhibit unroll for local memory if we have seen addressing not to
         // a variable, most likely we will be unable to combine it.
@@ -253,19 +252,18 @@ unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
 }
 
 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
-  AMDGPUAS AS = ST->getAMDGPUAS();
-  if (AddrSpace == AS.GLOBAL_ADDRESS ||
-      AddrSpace == AS.CONSTANT_ADDRESS ||
-      AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
+  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
+      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
+      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
     return 512;
   }
 
-  if (AddrSpace == AS.FLAT_ADDRESS ||
-      AddrSpace == AS.LOCAL_ADDRESS ||
-      AddrSpace == AS.REGION_ADDRESS)
+  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
+      AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+      AddrSpace == AMDGPUAS::REGION_ADDRESS)
     return 128;
 
-  if (AddrSpace == AS.PRIVATE_ADDRESS)
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
     return 8 * ST->getMaxPrivateElementSize();
 
   llvm_unreachable("unhandled address space");
@@ -277,7 +275,7 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
   // We allow vectorization of flat stores, even though we may need to decompose
   // them later if they may access private memory. We don't have enough context
   // here, and legalization can handle it.
-  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
     return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
       ChainSizeInBytes <= ST->getMaxPrivateElementSize();
   }
@@ -545,14 +543,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
   if (const Argument *A = dyn_cast<Argument>(V))
     return !isArgPassedInSGPR(A);
 
-  // Loads from the private address space are divergent, because threads
-  // can execute the load instruction with the same inputs and get different
-  // results.
+  // Loads from the private and flat address spaces are divergent, because
+  // threads can execute the load instruction with the same inputs and get
+  // different results.
   //
   // All other loads are not divergent, because if threads issue loads with the
   // same arguments, they will always get the same result.
   if (const LoadInst *Load = dyn_cast<LoadInst>(V))
-    return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;
+    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
+           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
 
   // Atomics are divergent because they are executed sequentially: when an
   // atomic operation refers to the same address in each thread, then each
@@ -642,20 +641,19 @@ unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
 }
 
 unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
-  AMDGPUAS AS = ST->getAMDGPUAS();
-  if (AddrSpace == AS.GLOBAL_ADDRESS ||
-      AddrSpace == AS.CONSTANT_ADDRESS)
+  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
+      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
     return 128;
-  if (AddrSpace == AS.LOCAL_ADDRESS ||
-      AddrSpace == AS.REGION_ADDRESS)
+  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+      AddrSpace == AMDGPUAS::REGION_ADDRESS)
     return 64;
-  if (AddrSpace == AS.PRIVATE_ADDRESS)
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
     return 32;
 
-  if ((AddrSpace == AS.PARAM_D_ADDRESS ||
-      AddrSpace == AS.PARAM_I_ADDRESS ||
-      (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
-      AddrSpace <= AS.CONSTANT_BUFFER_15)))
+  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
+      AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
+      (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
+      AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
     return 128;
   llvm_unreachable("unhandled address space");
 }
@@ -666,9 +664,7 @@ bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
   // We allow vectorization of flat stores, even though we may need to decompose
   // them later if they may access private memory. We don't have enough context
   // here, and legalization can handle it.
-  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS)
-    return false;
-  return true;
+  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
 }
 
 bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 8e63d789e17d7..397c5c6fa6fbe 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -179,7 +179,7 @@ public:
     if (IsGraphicsShader)
       return -1;
     return ST->hasFlatAddressSpace() ?
-      ST->getAMDGPUAS().FLAT_ADDRESS : ST->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE;
+      AMDGPUAS::FLAT_ADDRESS : AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
   }
 
   unsigned getVectorSplitCost() { return 0; }
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 0d3a1673696a5..ced3f6f567e2f 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -25,7 +25,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -70,7 +70,7 @@ char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
 INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                      "Unify divergent function exit nodes", false, false)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                     "Unify divergent function exit nodes", false, false)
 
@@ -78,10 +78,10 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
   // TODO: Preserve dominator tree.
   AU.addRequired<PostDominatorTreeWrapperPass>();
 
-  AU.addRequired<DivergenceAnalysis>();
+  AU.addRequired<LegacyDivergenceAnalysis>();
 
   // No divergent values are changed, only blocks and branch edges.
-  AU.addPreserved<DivergenceAnalysis>();
+  AU.addPreserved<LegacyDivergenceAnalysis>();
 
   // We preserve the non-critical-edgeness property
   AU.addPreservedID(BreakCriticalEdgesID);
@@ -95,7 +95,7 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
 
 /// \returns true if \p BB is reachable through only uniform branches.
 /// XXX - Is there a more efficient way to find this?
-static bool isUniformlyReached(const DivergenceAnalysis &DA,
+static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
                                BasicBlock &BB) {
   SmallVector<BasicBlock *, 8> Stack;
   SmallPtrSet<BasicBlock *, 8> Visited;
@@ -163,7 +163,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   if (PDT.getRoots().size() <= 1)
     return false;
 
-  DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
+  LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
 
   // Loop over all of the blocks in a function, tracking all of the blocks that
   // return.
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 31e2885c833d9..3f9af27a2e5e1 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -49,6 +49,7 @@
 #include "llvm/Support/MachineValueType.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SMLoc.h"
+#include "llvm/Support/TargetParser.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
@@ -156,13 +157,12 @@ public:
     ImmTyDMask,
     ImmTyUNorm,
     ImmTyDA,
-    ImmTyR128,
+    ImmTyR128A16,
     ImmTyLWE,
     ImmTyExpTgt,
     ImmTyExpCompr,
     ImmTyExpVM,
-    ImmTyDFMT,
-    ImmTyNFMT,
+    ImmTyFORMAT,
     ImmTyHwreg,
     ImmTyOff,
     ImmTySendMsg,
@@ -291,7 +291,7 @@ public:
   bool isDMask() const { return isImmTy(ImmTyDMask); }
   bool isUNorm() const { return isImmTy(ImmTyUNorm); }
   bool isDA() const { return isImmTy(ImmTyDA); }
-  bool isR128() const { return isImmTy(ImmTyR128); }
+  bool isR128A16() const { return isImmTy(ImmTyR128A16); }
   bool isLWE() const { return isImmTy(ImmTyLWE); }
   bool isOff() const { return isImmTy(ImmTyOff); }
   bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
@@ -312,8 +312,7 @@ public:
   bool isSLC() const { return isImmTy(ImmTySLC); }
   bool isTFE() const { return isImmTy(ImmTyTFE); }
   bool isD16() const { return isImmTy(ImmTyD16); }
-  bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); }
-  bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); }
+  bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); }
   bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
   bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
   bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
@@ -666,8 +665,7 @@ public:
     case ImmTySLC: OS << "SLC"; break;
     case ImmTyTFE: OS << "TFE"; break;
     case ImmTyD16: OS << "D16"; break;
-    case ImmTyDFMT: OS << "DFMT"; break;
-    case ImmTyNFMT: OS << "NFMT"; break;
+    case ImmTyFORMAT: OS << "FORMAT"; break;
     case ImmTyClampSI: OS << "ClampSI"; break;
     case ImmTyOModSI: OS << "OModSI"; break;
     case ImmTyDppCtrl: OS << "DppCtrl"; break;
@@ -681,7 +679,7 @@ public:
     case ImmTyDMask: OS << "DMask"; break;
     case ImmTyUNorm: OS << "UNorm"; break;
     case ImmTyDA: OS << "DA"; break;
-    case ImmTyR128: OS << "R128"; break;
+    case ImmTyR128A16: OS << "R128A16"; break;
     case ImmTyLWE: OS << "LWE"; break;
     case ImmTyOff: OS << "Off"; break;
     case ImmTyExpTgt: OS << "ExpTgt"; break;
@@ -920,8 +918,7 @@ public:
       // Currently there is none suitable machinery in the core llvm-mc for this.
       // MCSymbol::isRedefinable is intended for another purpose, and
       // AsmParser::parseDirectiveSet() cannot be specialized for specific target.
-      AMDGPU::IsaInfo::IsaVersion ISA =
-          AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+      AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
       MCContext &Ctx = getContext();
       if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
         MCSymbol *Sym =
@@ -1061,6 +1058,7 @@ public:
   OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
   OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands);
   OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
+  OperandMatchResultTy parseDfmtNfmt(OperandVector &Operands);
 
   void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
   void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); }
@@ -1092,7 +1090,6 @@ private:
   bool validateMIMGAtomicDMask(const MCInst &Inst);
   bool validateMIMGGatherDMask(const MCInst &Inst);
   bool validateMIMGDataSize(const MCInst &Inst);
-  bool validateMIMGR128(const MCInst &Inst);
   bool validateMIMGD16(const MCInst &Inst);
   bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
   bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -1829,7 +1826,7 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind,
                                             unsigned DwordRegIndex,
                                             unsigned RegWidth) {
   // Symbols are only defined for GCN targets
-  if (AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()).Major < 6)
+  if (AMDGPU::getIsaVersion(getSTI().getCPU()).Major < 6)
     return true;
 
   auto SymbolName = getGprCountSymbolName(RegKind);
@@ -2447,22 +2444,6 @@ bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) {
   return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8;
 }
 
-bool AMDGPUAsmParser::validateMIMGR128(const MCInst &Inst) {
-
-  const unsigned Opc = Inst.getOpcode();
-  const MCInstrDesc &Desc = MII.get(Opc);
-
-  if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
-    return true;
-
-  int Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
-  assert(Idx != -1);
-
-  bool R128 = (Inst.getOperand(Idx).getImm() != 0);
-
-  return !R128 || hasMIMG_R128();
-}
-
 bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
 
   const unsigned Opc = Inst.getOpcode();
@@ -2497,11 +2478,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
       "integer clamping is not supported on this GPU");
     return false;
   }
-  if (!validateMIMGR128(Inst)) {
-    Error(IDLoc,
-      "r128 modifier is not supported on this GPU");
-    return false;
-  }
   // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
   if (!validateMIMGD16(Inst)) {
     Error(IDLoc,
@@ -2661,18 +2637,18 @@ bool AMDGPUAsmParser::calculateGPRBlocks(
     unsigned &SGPRBlocks) {
   // TODO(scott.linder): These calculations are duplicated from
   // AMDGPUAsmPrinter::getSIProgramInfo and could be unified.
-  IsaInfo::IsaVersion Version = IsaInfo::getIsaVersion(Features);
+  IsaVersion Version = getIsaVersion(getSTI().getCPU());
 
   unsigned NumVGPRs = NextFreeVGPR;
   unsigned NumSGPRs = NextFreeSGPR;
-  unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(Features);
+  unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(&getSTI());
 
   if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
       NumSGPRs > MaxAddressableNumSGPRs)
     return OutOfRangeError(SGPRRange);
 
   NumSGPRs +=
-      IsaInfo::getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed, XNACKUsed);
+      IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed);
 
   if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
       NumSGPRs > MaxAddressableNumSGPRs)
@@ -2681,8 +2657,8 @@ bool AMDGPUAsmParser::calculateGPRBlocks(
   if (Features.test(FeatureSGPRInitBug))
     NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
 
-  VGPRBlocks = IsaInfo::getNumVGPRBlocks(Features, NumVGPRs);
-  SGPRBlocks = IsaInfo::getNumSGPRBlocks(Features, NumSGPRs);
+  VGPRBlocks = IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs);
+  SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs);
 
   return false;
 }
@@ -2702,8 +2678,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
 
   StringSet<> Seen;
 
-  IsaInfo::IsaVersion IVersion =
-      IsaInfo::getIsaVersion(getSTI().getFeatureBits());
+  IsaVersion IVersion = getIsaVersion(getSTI().getCPU());
 
   SMRange VGPRRange;
   uint64_t NextFreeVGPR = 0;
@@ -2962,8 +2937,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
   // If this directive has no arguments, then use the ISA version for the
   // targeted GPU.
   if (getLexer().is(AsmToken::EndOfStatement)) {
-    AMDGPU::IsaInfo::IsaVersion ISA =
-        AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+    AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
     getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor,
                                                       ISA.Stepping,
                                                       "AMD", "AMDGPU");
@@ -3025,7 +2999,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
 
 bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
   amd_kernel_code_t Header;
-  AMDGPU::initDefaultAMDKernelCodeT(Header, getFeatureBits());
+  AMDGPU::initDefaultAMDKernelCodeT(Header, &getSTI());
 
   while (true) {
     // Lex EndOfStatement.  This is in a while loop, because lexing a comment
@@ -3091,9 +3065,18 @@ bool AMDGPUAsmParser::ParseDirectiveISAVersion() {
 }
 
 bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
+  const char *AssemblerDirectiveBegin;
+  const char *AssemblerDirectiveEnd;
+  std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
+      AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())
+          ? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin,
+                            HSAMD::V3::AssemblerDirectiveEnd)
+          : std::make_tuple(HSAMD::AssemblerDirectiveBegin,
+                            HSAMD::AssemblerDirectiveEnd);
+
   if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) {
     return Error(getParser().getTok().getLoc(),
-                 (Twine(HSAMD::AssemblerDirectiveBegin) + Twine(" directive is "
+                 (Twine(AssemblerDirectiveBegin) + Twine(" directive is "
                  "not available on non-amdhsa OSes")).str());
   }
 
@@ -3111,7 +3094,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
 
     if (getLexer().is(AsmToken::Identifier)) {
       StringRef ID = getLexer().getTok().getIdentifier();
-      if (ID == AMDGPU::HSAMD::AssemblerDirectiveEnd) {
+      if (ID == AssemblerDirectiveEnd) {
         Lex();
         FoundEnd = true;
         break;
@@ -3133,8 +3116,13 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
 
   YamlStream.flush();
 
-  if (!getTargetStreamer().EmitHSAMetadata(HSAMetadataString))
-    return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+  if (IsaInfo::hasCodeObjectV3(&getSTI())) {
+    if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
+      return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+  } else {
+    if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString))
+      return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+  }
 
   return false;
 }
@@ -3171,6 +3159,10 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
 
     if (IDVal == ".amdhsa_kernel")
       return ParseDirectiveAMDHSAKernel();
+
+    // TODO: Restructure/combine with PAL metadata directive.
+    if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin)
+      return ParseDirectiveHSAMetadata();
   } else {
     if (IDVal == ".hsa_code_object_version")
       return ParseDirectiveHSACodeObjectVersion();
@@ -3186,10 +3178,10 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
 
     if (IDVal == ".amd_amdgpu_isa")
       return ParseDirectiveISAVersion();
-  }
 
-  if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
-    return ParseDirectiveHSAMetadata();
+    if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
+      return ParseDirectiveHSAMetadata();
+  }
 
   if (IDVal == PALMD::AssemblerDirective)
     return ParseDirectivePALMetadata();
@@ -3465,6 +3457,10 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
       case AsmToken::Identifier: {
         StringRef Tok = Parser.getTok().getString();
         if (Tok == Name) {
+          if (Tok == "r128" && isGFX9())
+            Error(S, "r128 modifier is not supported on this GPU");
+          if (Tok == "a16" && !isGFX9())
+            Error(S, "a16 modifier is not supported on this GPU");
           Bit = 1;
           Parser.Lex();
         } else if (Tok.startswith("no") && Tok.endswith(Name)) {
@@ -3522,6 +3518,53 @@ AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) {
   return MatchOperand_Success;
 }
 
+// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
+// values to live in a joint format operand in the MCInst encoding.
+OperandMatchResultTy
+AMDGPUAsmParser::parseDfmtNfmt(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  int64_t Dfmt = 0, Nfmt = 0;
+  // dfmt and nfmt can appear in either order, and each is optional.
+  bool GotDfmt = false, GotNfmt = false;
+  while (!GotDfmt || !GotNfmt) {
+    if (!GotDfmt) {
+      auto Res = parseIntWithPrefix("dfmt", Dfmt);
+      if (Res != MatchOperand_NoMatch) {
+        if (Res != MatchOperand_Success)
+          return Res;
+        if (Dfmt >= 16) {
+          Error(Parser.getTok().getLoc(), "out of range dfmt");
+          return MatchOperand_ParseFail;
+        }
+        GotDfmt = true;
+        Parser.Lex();
+        continue;
+      }
+    }
+    if (!GotNfmt) {
+      auto Res = parseIntWithPrefix("nfmt", Nfmt);
+      if (Res != MatchOperand_NoMatch) {
+        if (Res != MatchOperand_Success)
+          return Res;
+        if (Nfmt >= 8) {
+          Error(Parser.getTok().getLoc(), "out of range nfmt");
+          return MatchOperand_ParseFail;
+        }
+        GotNfmt = true;
+        Parser.Lex();
+        continue;
+      }
+    }
+    break;
+  }
+  if (!GotDfmt && !GotNfmt)
+    return MatchOperand_NoMatch;
+  auto Format = Dfmt | Nfmt << 4;
+  Operands.push_back(
+      AMDGPUOperand::CreateImm(this, Format, S, AMDGPUOperand::ImmTyFORMAT));
+  return MatchOperand_Success;
+}
+
 //===----------------------------------------------------------------------===//
 // ds
 //===----------------------------------------------------------------------===//
@@ -3652,12 +3695,12 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
 
 static bool
 encodeCnt(
-  const AMDGPU::IsaInfo::IsaVersion ISA,
+  const AMDGPU::IsaVersion ISA,
   int64_t &IntVal,
   int64_t CntVal,
   bool Saturate,
-  unsigned (*encode)(const IsaInfo::IsaVersion &Version, unsigned, unsigned),
-  unsigned (*decode)(const IsaInfo::IsaVersion &Version, unsigned))
+  unsigned (*encode)(const IsaVersion &Version, unsigned, unsigned),
+  unsigned (*decode)(const IsaVersion &Version, unsigned))
 {
   bool Failed = false;
 
@@ -3688,8 +3731,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
   if (getParser().parseAbsoluteExpression(CntVal))
     return true;
 
-  AMDGPU::IsaInfo::IsaVersion ISA =
-      AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+  AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
 
   bool Failed = true;
   bool Sat = CntName.endswith("_sat");
@@ -3724,8 +3766,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
 
 OperandMatchResultTy
 AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
-  AMDGPU::IsaInfo::IsaVersion ISA =
-      AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+  AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
   int64_t Waitcnt = getWaitcntBitMask(ISA);
   SMLoc S = Parser.getTok().getLoc();
 
@@ -4617,8 +4658,7 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
 
   addOptionalImmOperand(Inst, Operands, OptionalIdx,
                         AMDGPUOperand::ImmTyOffset);
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT);
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyFORMAT);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
@@ -4661,7 +4701,7 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
@@ -4761,8 +4801,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
   {"lds",     AMDGPUOperand::ImmTyLDS, true, nullptr},
   {"offset",  AMDGPUOperand::ImmTyOffset, false, nullptr},
   {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
-  {"dfmt",    AMDGPUOperand::ImmTyDFMT, false, nullptr},
-  {"nfmt",    AMDGPUOperand::ImmTyNFMT, false, nullptr},
+  {"dfmt",    AMDGPUOperand::ImmTyFORMAT, false, nullptr},
   {"glc",     AMDGPUOperand::ImmTyGLC, true, nullptr},
   {"slc",     AMDGPUOperand::ImmTySLC, true, nullptr},
   {"tfe",     AMDGPUOperand::ImmTyTFE, true, nullptr},
@@ -4772,7 +4811,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
   {"omod",    AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
   {"unorm",   AMDGPUOperand::ImmTyUNorm, true, nullptr},
   {"da",      AMDGPUOperand::ImmTyDA,    true, nullptr},
-  {"r128",    AMDGPUOperand::ImmTyR128,  true, nullptr},
+  {"r128",    AMDGPUOperand::ImmTyR128A16,  true, nullptr},
+  {"a16",     AMDGPUOperand::ImmTyR128A16,  true, nullptr},
   {"lwe",     AMDGPUOperand::ImmTyLWE,   true, nullptr},
   {"d16",     AMDGPUOperand::ImmTyD16,   true, nullptr},
   {"dmask",   AMDGPUOperand::ImmTyDMask, false, nullptr},
@@ -4844,6 +4884,8 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands)
                Op.Type == AMDGPUOperand::ImmTyNegHi) {
       res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type,
                                         Op.ConvertResult);
+    } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT) {
+      res = parseDfmtNfmt(Operands);
     } else {
       res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
     }
@@ -5251,12 +5293,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
     ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
   }
 
-  // All DPP instructions with at least one source operand have a fake "old"
-  // source at the beginning that's tied to the dst operand. Handle it here.
-  if (Desc.getNumOperands() >= 2)
-    Inst.addOperand(Inst.getOperand(0));
-
   for (unsigned E = Operands.size(); I != E; ++I) {
+    auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
+                                            MCOI::TIED_TO);
+    if (TiedTo != -1) {
+      assert((unsigned)TiedTo < Inst.getNumOperands());
+      // handle tied old or src2 for MAC instructions
+      Inst.addOperand(Inst.getOperand(TiedTo));
+    }
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
     // Add the register arguments
     if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index b87c47a6b9eea..51c2abeac2ffb 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -17,14 +17,12 @@ def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [],
 def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
 def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
 def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
-def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">;
-def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">;
 
 class MubufLoad <SDPatternOperator op> : PatFrag <
   (ops node:$ptr), (op node:$ptr), [{
   auto const AS = cast<MemSDNode>(N)->getAddressSpace();
-  return AS == AMDGPUASI.GLOBAL_ADDRESS ||
-         AS == AMDGPUASI.CONSTANT_ADDRESS;
+  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+         AS == AMDGPUAS::CONSTANT_ADDRESS;
 }]>;
 
 def mubuf_load          : MubufLoad <load>;
@@ -100,15 +98,11 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
   bits<1> has_vaddr   = 1;
   bits<1> has_glc     = 1;
   bits<1> glc_value   = 0; // the value for glc if no such operand
-  bits<4> dfmt_value  = 1; // the value for dfmt if no such operand
-  bits<3> nfmt_value  = 0; // the value for nfmt if no such operand
   bits<1> has_srsrc   = 1;
   bits<1> has_soffset = 1;
   bits<1> has_offset  = 1;
   bits<1> has_slc     = 1;
   bits<1> has_tfe     = 1;
-  bits<1> has_dfmt    = 1;
-  bits<1> has_nfmt    = 1;
 }
 
 class MTBUF_Real <MTBUF_Pseudo ps> :
@@ -126,14 +120,16 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
 
   bits<12> offset;
   bits<1>  glc;
-  bits<4>  dfmt;
-  bits<3>  nfmt;
+  bits<7>  format;
   bits<8>  vaddr;
   bits<8>  vdata;
   bits<7>  srsrc;
   bits<1>  slc;
   bits<1>  tfe;
   bits<8>  soffset;
+
+  bits<4> dfmt = format{3-0};
+  bits<3> nfmt = format{6-4};
 }
 
 class getMTBUFInsDA<list<RegisterClass> vdataList,
@@ -142,16 +138,16 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
   RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
   dag InsNoData = !if(!empty(vaddrList),
     (ins                    SReg_128:$srsrc, SCSrc_b32:$soffset,
-         offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe),
+         offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe),
     (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
-         offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe)
+         offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe)
   );
   dag InsData = !if(!empty(vaddrList),
     (ins vdataClass:$vdata,                    SReg_128:$srsrc,
-         SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+         SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
          SLC:$slc, TFE:$tfe),
     (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
-         SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+         SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
          SLC:$slc, TFE:$tfe)
   );
   dag ret = !if(!empty(vdataList), InsNoData, InsData);
@@ -169,15 +165,15 @@ class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
 
 class getMTBUFAsmOps<int addrKind> {
   string Pfx =
-    !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset",
+    !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $format, $soffset",
     !if(!eq(addrKind, BUFAddrKind.OffEn),
-            "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen",
+            "$vaddr, $srsrc, $format, $soffset offen",
     !if(!eq(addrKind, BUFAddrKind.IdxEn),
-            "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen",
+            "$vaddr, $srsrc, $format, $soffset idxen",
     !if(!eq(addrKind, BUFAddrKind.BothEn),
-            "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen",
+            "$vaddr, $srsrc, $format, $soffset idxen offen",
     !if(!eq(addrKind, BUFAddrKind.Addr64),
-            "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64",
+            "$vaddr, $srsrc, $format, $soffset addr64",
     "")))));
   string ret = Pfx # "$offset";
 }
@@ -217,14 +213,14 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
 
   def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
     [(set load_vt:$vdata,
-     (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt,
-                      i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+     (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format,
+                      i1:$glc, i1:$slc, i1:$tfe)))]>,
     MTBUFAddr64Table<0, NAME>;
 
   def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
     [(set load_vt:$vdata,
      (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
-                      i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+                      i8:$format, i1:$glc, i1:$slc, i1:$tfe)))]>,
     MTBUFAddr64Table<1, NAME>;
 
   def _OFFEN  : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
@@ -263,13 +259,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
 
   def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
     [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
-                                       i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+                                       i16:$offset, i8:$format, i1:$glc,
                                        i1:$slc, i1:$tfe))]>,
     MTBUFAddr64Table<0, NAME>;
 
   def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
     [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
-                                       i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+                                       i16:$offset, i8:$format, i1:$glc,
                                        i1:$slc, i1:$tfe))]>,
     MTBUFAddr64Table<1, NAME>;
 
@@ -290,6 +286,12 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
 // MUBUF classes
 //===----------------------------------------------------------------------===//
 
+class MUBUFGetBaseOpcode<string Op> {
+  string ret = !subst("DWORDX2", "DWORD",
+    !subst("DWORDX3", "DWORD",
+    !subst("DWORDX4", "DWORD", Op)));
+}
+
 class MUBUF_Pseudo <string opName, dag outs, dag ins,
                     string asmOps, list<dag> pattern=[]> :
   InstSI<outs, ins, "", pattern>,
@@ -303,6 +305,9 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
   string Mnemonic = opName;
   string AsmOperands = asmOps;
 
+  Instruction Opcode = !cast<Instruction>(NAME);
+  Instruction BaseOpcode = !cast<Instruction>(MUBUFGetBaseOpcode<NAME>.ret);
+
   let VM_CNT = 1;
   let EXP_CNT = 1;
   let MUBUF = 1;
@@ -325,6 +330,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
   bits<1> has_offset  = 1;
   bits<1> has_slc     = 1;
   bits<1> has_tfe     = 1;
+  bits<4> dwords      = 0;
 }
 
 class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
@@ -398,6 +404,16 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
              );
 }
 
+class getMUBUFDwords<RegisterClass regClass> {
+  string regClassAsInt = !cast<string>(regClass);
+  int ret =
+    !if(!eq(regClassAsInt, !cast<string>(VGPR_32)), 1,
+    !if(!eq(regClassAsInt, !cast<string>(VReg_64)), 2,
+    !if(!eq(regClassAsInt, !cast<string>(VReg_96)), 3,
+    !if(!eq(regClassAsInt, !cast<string>(VReg_128)), 4,
+    0))));
+}
+
 class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> {
   dag ret =
     !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isLds>.ret,
@@ -458,6 +474,7 @@ class MUBUF_Load_Pseudo <string opName,
   let Uses = !if(isLds, [EXEC, M0], [EXEC]);
   let has_tfe = !if(isLds, 0, 1);
   let lds = isLds;
+  let dwords = getMUBUFDwords<vdataClass>.ret;
 }
 
 // FIXME: tfe can't be an operand because it requires a separate
@@ -521,6 +538,7 @@ class MUBUF_Store_Pseudo <string opName,
   let mayLoad = 0;
   let mayStore = 1;
   let maybeAtomic = 1;
+  let dwords = getMUBUFDwords<vdataClass>.ret;
 }
 
 multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
@@ -660,11 +678,10 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
   let AsmMatchConverter = "cvtMubufAtomicReturn";
 }
 
-multiclass MUBUF_Pseudo_Atomics <string opName,
-                                 RegisterClass vdataClass,
-                                 ValueType vdataType,
-                                 SDPatternOperator atomic> {
-
+multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
+                                        RegisterClass vdataClass,
+                                        ValueType vdataType,
+                                        SDPatternOperator atomic> {
   def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
                 MUBUFAddr64Table <0, NAME>;
   def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
@@ -672,7 +689,12 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
   def _OFFEN  : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn,  vdataClass>;
   def _IDXEN  : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn,  vdataClass>;
   def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+}
 
+multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
+                                     RegisterClass vdataClass,
+                                     ValueType vdataType,
+                                     SDPatternOperator atomic> {
   def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
     [(set vdataType:$vdata,
      (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
@@ -690,6 +712,13 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
   def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
 }
 
+multiclass MUBUF_Pseudo_Atomics <string opName,
+                                 RegisterClass vdataClass,
+                                 ValueType vdataType,
+                                 SDPatternOperator atomic> :
+  MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType, atomic>,
+  MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>;
+
 
 //===----------------------------------------------------------------------===//
 // MUBUF Instructions
@@ -1030,6 +1059,14 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
 // MUBUF Patterns
 //===----------------------------------------------------------------------===//
 
+def extract_glc : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
+}]>;
+
+def extract_slc : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
+}]>;
+
 //===----------------------------------------------------------------------===//
 // buffer_load/store_format patterns
 //===----------------------------------------------------------------------===//
@@ -1037,119 +1074,129 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
 multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
                                   string opcode> {
   def : GCNPat<
-    (vt (name v4i32:$rsrc, 0,
-              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-              imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, 0)),
     (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
-      (as_i1imm $glc), (as_i1imm $slc), 0)
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (vt (name v4i32:$rsrc, i32:$vindex,
-              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-              imm:$glc, imm:$slc)),
-    (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
-      (as_i1imm $glc), (as_i1imm $slc), 0)
+    (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, 0)),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (vt (name v4i32:$rsrc, 0,
-              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-              imm:$glc, imm:$slc)),
-    (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
-      (as_i1imm $glc), (as_i1imm $slc), 0)
+    (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, imm)),
+    (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (vt (name v4i32:$rsrc, i32:$vindex,
-              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-              imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, imm)),
     (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
       $rsrc, $soffset, (as_i16imm $offset),
-      (as_i1imm $glc), (as_i1imm $slc), 0)
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 }
 
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, i32, "BUFFER_LOAD_FORMAT_X">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2i32, "BUFFER_LOAD_FORMAT_XY">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_XYZW">;
 
 let SubtargetPredicate = HasUnpackedD16VMem in {
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
 } // End HasUnpackedD16VMem.
 
 let SubtargetPredicate = HasPackedD16VMem in {
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
+  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
+  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
+  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
 } // End HasPackedD16VMem.
 
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
 
 multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode> {
   def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, 0,
-          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-          imm:$glc, imm:$slc),
+    (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, 0),
     (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
-                                    (as_i1imm $glc), (as_i1imm $slc), 0)
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
-          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-          imm:$glc, imm:$slc),
-    (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
-                                   (as_i16imm $offset), (as_i1imm $glc),
-                                   (as_i1imm $slc), 0)
+    (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, 0),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
+      (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, 0,
-          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-          imm:$glc, imm:$slc),
-    (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
-                                   (as_i16imm $offset), (as_i1imm $glc),
-                                   (as_i1imm $slc), 0)
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, imm),
+    (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
+      (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
-          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-          imm:$glc, imm:$slc),
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, imm),
     (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
       $vdata,
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
       $rsrc, $soffset, (as_i16imm $offset),
-      (as_i1imm $glc), (as_i1imm $slc), 0)
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 }
 
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMAT_XYZW">;
 
 let SubtargetPredicate = HasUnpackedD16VMem in {
   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+  defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
 } // End HasUnpackedD16VMem.
 
 let SubtargetPredicate = HasPackedD16VMem in {
   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
+  defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X">;
   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
+  defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i16, "BUFFER_STORE_FORMAT_D16_XY">;
   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
+  defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZW">;
 } // End HasPackedD16VMem.
 
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
 
 //===----------------------------------------------------------------------===//
 // buffer_atomic patterns
@@ -1158,36 +1205,36 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
 multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
   def : GCNPat<
     (name i32:$vdata_in, v4i32:$rsrc, 0,
-          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-          imm:$slc),
+          0, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, 0),
     (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
-                                        (as_i16imm $offset), (as_i1imm $slc))
+                                        (as_i16imm $offset), (extract_slc $cachepolicy))
   >;
 
   def : GCNPat<
     (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
-          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-          imm:$slc),
+          0, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, imm),
     (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
-                                       (as_i16imm $offset), (as_i1imm $slc))
+                                       (as_i16imm $offset), (extract_slc $cachepolicy))
   >;
 
   def : GCNPat<
     (name i32:$vdata_in, v4i32:$rsrc, 0,
-          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-          imm:$slc),
+          i32:$voffset, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, 0),
     (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
-                                       (as_i16imm $offset), (as_i1imm $slc))
+                                       (as_i16imm $offset), (extract_slc $cachepolicy))
   >;
 
   def : GCNPat<
     (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
-          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-          imm:$slc),
+          i32:$voffset, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, imm),
     (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
       $vdata_in,
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-      $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
+      $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
   >;
 }
 
@@ -1205,49 +1252,49 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
 def : GCNPat<
   (SIbuffer_atomic_cmpswap
       i32:$data, i32:$cmp, v4i32:$rsrc, 0,
-      (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-      imm:$slc),
+      0, i32:$soffset, imm:$offset,
+      imm:$cachepolicy, 0),
   (EXTRACT_SUBREG
     (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
       (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
-      $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+      $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
     sub0)
 >;
 
 def : GCNPat<
   (SIbuffer_atomic_cmpswap
       i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
-      (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-      imm:$slc),
+      0, i32:$soffset, imm:$offset,
+      imm:$cachepolicy, imm),
   (EXTRACT_SUBREG
     (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
       (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
-      $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+      $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
     sub0)
 >;
 
 def : GCNPat<
   (SIbuffer_atomic_cmpswap
       i32:$data, i32:$cmp, v4i32:$rsrc, 0,
-      (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-      imm:$slc),
+      i32:$voffset, i32:$soffset, imm:$offset,
+      imm:$cachepolicy, 0),
   (EXTRACT_SUBREG
     (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
       (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
-      $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+      $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
     sub0)
 >;
 
 def : GCNPat<
   (SIbuffer_atomic_cmpswap
       i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
-      (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-      imm:$slc),
+      i32:$voffset, i32:$soffset, imm:$offset,
+      imm:$cachepolicy, imm),
   (EXTRACT_SUBREG
     (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
       (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-      $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+      $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
     sub0)
 >;
 
@@ -1397,54 +1444,6 @@ defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D
 defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
 defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
 }
-
-// BUFFER_LOAD_DWORD*, addr64=0
-multiclass MUBUF_Load_Dword <ValueType vt,
-                             MUBUF_Pseudo offset,
-                             MUBUF_Pseudo offen,
-                             MUBUF_Pseudo idxen,
-                             MUBUF_Pseudo bothen> {
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
-                                  imm:$offset, 0, 0, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
-            (as_i1imm $slc), (as_i1imm $tfe))
-  >;
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
-                                  imm:$offset, 1, 0, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
-           (as_i1imm $tfe))
-  >;
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
-                                  imm:$offset, 0, 1, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
-           (as_i1imm $slc), (as_i1imm $tfe))
-  >;
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
-                                  imm:$offset, 1, 1, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
-            (as_i1imm $tfe))
-  >;
-}
-
-defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
-                         BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
-defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
-                         BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
-defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
-                         BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
-
 multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
                                       ValueType vt, PatFrag atomic_st> {
   // Store follows atomic op convention so address is forst
@@ -1524,32 +1523,36 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
                                   string opcode> {
   def : GCNPat<
     (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
-              imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+              imm:$format, imm:$cachepolicy, 0)),
     (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
-      (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+      (as_i8imm $format),
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
     (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
-              imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+              imm:$format, imm:$cachepolicy, imm)),
     (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
-      (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+      (as_i8imm $format),
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
     (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
-              imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+              imm:$format, imm:$cachepolicy, 0)),
     (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
-      (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+      (as_i8imm $format),
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
     (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
-              imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+              imm:$format, imm:$cachepolicy, imm)),
     (!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
       $rsrc, $soffset, (as_i16imm $offset),
-      (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+      (as_i8imm $format),
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 }
 
@@ -1576,39 +1579,36 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode> {
   def : GCNPat<
     (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
-          imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+          imm:$format, imm:$cachepolicy, 0),
     (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
-                                (as_i16imm $offset), (as_i8imm $dfmt),
-                                (as_i8imm $nfmt), (as_i1imm $glc),
-                                (as_i1imm $slc), 0)
+      (as_i16imm $offset), (as_i8imm $format),
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
     (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
-          imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+          imm:$format, imm:$cachepolicy, imm),
     (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
-                                   (as_i16imm $offset), (as_i8imm $dfmt),
-                                   (as_i8imm $nfmt), (as_i1imm $glc),
-                                   (as_i1imm $slc), 0)
+      (as_i16imm $offset), (as_i8imm $format),
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
     (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
-          imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+          imm:$format, imm:$cachepolicy, 0),
     (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
-                                   (as_i16imm $offset), (as_i8imm $dfmt),
-                                   (as_i8imm $nfmt), (as_i1imm $glc),
-                                   (as_i1imm $slc), 0)
+      (as_i16imm $offset), (as_i8imm $format),
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
     (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
-          imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+          imm:$offset, imm:$format, imm:$cachepolicy, imm),
     (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
       $vdata,
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-      $rsrc, $soffset, (as_i16imm $offset),
-      (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+      $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 }
 
@@ -1781,8 +1781,8 @@ class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
   let Inst{14}    = !if(ps.has_glc, glc, ps.glc_value);
   let Inst{15}    = ps.addr64;
   let Inst{18-16} = op;
-  let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
-  let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+  let Inst{22-19} = dfmt;
+  let Inst{25-23} = nfmt;
   let Inst{31-26} = 0x3a; //encoding
   let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
   let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
@@ -1811,6 +1811,7 @@ defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>;
 
 //===----------------------------------------------------------------------===//
 // CI
+// MTBUF - GFX6, GFX7.
 //===----------------------------------------------------------------------===//
 
 class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> :
@@ -2013,8 +2014,8 @@ class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
   let Inst{13}    = ps.idxen;
   let Inst{14}    = !if(ps.has_glc, glc, ps.glc_value);
   let Inst{18-15} = op;
-  let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
-  let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+  let Inst{22-19} = dfmt;
+  let Inst{25-23} = nfmt;
   let Inst{31-26} = 0x3a; //encoding
   let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
   let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
@@ -2043,8 +2044,8 @@ class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
   let Inst{13}    = ps.idxen;
   let Inst{14}    = !if(ps.has_glc, glc, ps.glc_value);
   let Inst{18-15} = op;
-  let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
-  let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+  let Inst{22-19} = dfmt;
+  let Inst{25-23} = nfmt;
   let Inst{31-26} = 0x3a; //encoding
   let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
   let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
@@ -2089,3 +2090,22 @@ let SubtargetPredicate = HasPackedD16VMem in {
   defm TBUFFER_STORE_FORMAT_D16_XYZ  : MTBUF_Real_AllAddr_vi <0x0e>;
   defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>;
 } // End HasUnpackedD16VMem.
+
+def MUBUFInfoTable : GenericTable {
+  let FilterClass = "MUBUF_Pseudo";
+  let CppTypeName = "MUBUFInfo";
+  let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"];
+
+  let PrimaryKey = ["Opcode"];
+  let PrimaryKeyName = "getMUBUFOpcodeHelper";
+}
+
+def getMUBUFInfoFromOpcode : SearchIndex {
+  let Table = MUBUFInfoTable;
+  let Key = ["Opcode"];
+}
+
+def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex {
+  let Table = MUBUFInfoTable;
+  let Key = ["BaseOpcode", "dwords"];
+}
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 174b2df15300a..393311791ec93 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -37,8 +37,10 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAnnotateUniformValues.cpp
   AMDGPUArgumentUsageInfo.cpp
   AMDGPUAsmPrinter.cpp
+  AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInstrInfo.cpp
@@ -91,9 +93,11 @@ add_llvm_target(AMDGPUCodeGen
   R600OptimizeVectorRegisters.cpp
   R600Packetizer.cpp
   R600RegisterInfo.cpp
+  SIAddIMGInit.cpp
   SIAnnotateControlFlow.cpp
   SIDebuggerInsertNops.cpp
   SIFixSGPRCopies.cpp
+  SIFixupVectorISel.cpp
   SIFixVGPRCopies.cpp
   SIFixWWMLiveness.cpp
   SIFoldOperands.cpp
@@ -116,6 +120,8 @@ add_llvm_target(AMDGPUCodeGen
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
+  GCNDPPCombine.cpp
+  SIModeRegister.cpp
   )
 
 add_subdirectory(AsmParser)
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index cdc6ab9412e61..31d2ebef481d2 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -728,7 +728,9 @@ class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
               (i1 0))
 >;
 
-let OtherPredicates = [LDSRequiresM0Init] in {
+// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
+// related to bounds checking.
+let OtherPredicates = [LDSRequiresM0Init, isCIVI] in {
 def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
 def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
 }
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 3ef473b7fd966..44040d352e6a8 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -121,6 +121,11 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
   let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
 }
 
+class GlobalSaddrTable <bit is_saddr, string Name = ""> {
+  bit IsSaddr = is_saddr;
+  string SaddrOp = Name;
+}
+
 // TODO: Is exec allowed for saddr? The disabled value 0x7f is the
 // same encoding value as exec_hi, so it isn't possible to use that if
 // saddr is 32-bit (which isn't handled here yet).
@@ -171,15 +176,19 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
 
 multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
   let is_flat_global = 1 in {
-    def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>;
-    def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>;
+    def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+      GlobalSaddrTable<0, opName>;
+    def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>,
+      GlobalSaddrTable<1, opName>;
   }
 }
 
 multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
   let is_flat_global = 1 in {
-    def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>;
-    def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>;
+    def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+      GlobalSaddrTable<0, opName>;
+    def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>,
+      GlobalSaddrTable<1, opName>;
   }
 }
 
@@ -262,6 +271,7 @@ multiclass FLAT_Atomic_Pseudo<
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
     " $vaddr, $vdata$offset$slc">,
+    GlobalSaddrTable<0, opName>,
     AtomicNoRet <opName, 0> {
     let PseudoInstr = NAME;
   }
@@ -272,10 +282,11 @@ multiclass FLAT_Atomic_Pseudo<
     " $vdst, $vaddr, $vdata$offset glc$slc",
     [(set vt:$vdst,
       (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+       GlobalSaddrTable<0, opName#"_rtn">,
        AtomicNoRet <opName, 1>;
 }
 
-multiclass FLAT_Global_Atomic_Pseudo<
+multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
   string opName,
   RegisterClass vdst_rc,
   ValueType vt,
@@ -287,35 +298,48 @@ multiclass FLAT_Global_Atomic_Pseudo<
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
     " $vaddr, $vdata, off$offset$slc">,
+    GlobalSaddrTable<0, opName>,
     AtomicNoRet <opName, 0> {
     let has_saddr = 1;
     let PseudoInstr = NAME;
   }
 
-  def _RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_rc:$vdst),
-      (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
-    " $vdst, $vaddr, $vdata, off$offset glc$slc",
-    [(set vt:$vdst,
-      (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
-      AtomicNoRet <opName, 1> {
-    let has_saddr = 1;
-  }
-
   def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
     " $vaddr, $vdata, $saddr$offset$slc">,
+    GlobalSaddrTable<1, opName>,
     AtomicNoRet <opName#"_saddr", 0> {
     let has_saddr = 1;
     let enabled_saddr = 1;
     let PseudoInstr = NAME#"_SADDR";
   }
+}
+
+multiclass FLAT_Global_Atomic_Pseudo_RTN<
+  string opName,
+  RegisterClass vdst_rc,
+  ValueType vt,
+  SDPatternOperator atomic = null_frag,
+  ValueType data_vt = vt,
+  RegisterClass data_rc = vdst_rc> {
+
+  def _RTN : FLAT_AtomicRet_Pseudo <opName,
+    (outs vdst_rc:$vdst),
+      (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
+    " $vdst, $vaddr, $vdata, off$offset glc$slc",
+    [(set vt:$vdst,
+      (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+      GlobalSaddrTable<0, opName#"_rtn">,
+      AtomicNoRet <opName, 1> {
+    let has_saddr = 1;
+  }
 
   def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
     (outs vdst_rc:$vdst),
       (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
     " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
+    GlobalSaddrTable<1, opName#"_rtn">,
     AtomicNoRet <opName#"_saddr", 1> {
      let has_saddr = 1;
      let enabled_saddr = 1;
@@ -323,10 +347,20 @@ multiclass FLAT_Global_Atomic_Pseudo<
   }
 }
 
+multiclass FLAT_Global_Atomic_Pseudo<
+  string opName,
+  RegisterClass vdst_rc,
+  ValueType vt,
+  SDPatternOperator atomic = null_frag,
+  ValueType data_vt = vt,
+  RegisterClass data_rc = vdst_rc> :
+    FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>,
+    FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>;
+
 class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
   (ops node:$ptr, node:$value),
   (atomic_op node:$ptr, node:$value),
-  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.FLAT_ADDRESS;}]
+  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
 >;
 
 def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp
new file mode 100644
index 0000000000000..56071d0d23744
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -0,0 +1,446 @@
+//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
+// operand.If any of the use instruction cannot be combined with the mov the
+// whole sequence is reverted.
+//
+// $old = ...
+// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
+//                            dpp_controls..., $bound_ctrl
+// $res = VALU $dpp_value, ...
+//
+// to
+//
+// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
+//                 dpp_controls..., $folded_bound_ctrl
+//
+// Combining rules :
+//
+// $bound_ctrl is DPP_BOUND_ZERO, $old is any
+// $bound_ctrl is DPP_BOUND_OFF, $old is 0
+//
+// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
+// $bound_ctrl is DPP_BOUND_OFF, $old is undef
+//
+// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
+// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+//
+// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Pass.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gcn-dpp-combine"
+
+STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
+
+namespace {
+
+class GCNDPPCombine : public MachineFunctionPass {
+  MachineRegisterInfo *MRI;
+  const SIInstrInfo *TII;
+
+  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
+
+  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
+
+  RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
+                            RegSubRegPair OldOpndVGPR,
+                            MachineOperand &OldOpndValue) const;
+
+  MachineInstr *createDPPInst(MachineInstr &OrigMI,
+                              MachineInstr &MovMI,
+                              RegSubRegPair OldOpndVGPR,
+                              MachineOperand *OldOpnd,
+                              bool BoundCtrlZero) const;
+
+  MachineInstr *createDPPInst(MachineInstr &OrigMI,
+                              MachineInstr &MovMI,
+                              RegSubRegPair OldOpndVGPR,
+                              bool BoundCtrlZero) const;
+
+  bool hasNoImmOrEqual(MachineInstr &MI,
+                       unsigned OpndName,
+                       int64_t Value,
+                       int64_t Mask = -1) const;
+
+  bool combineDPPMov(MachineInstr &MI) const;
+
+public:
+  static char ID;
+
+  GCNDPPCombine() : MachineFunctionPass(ID) {
+    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "GCN DPP Combine"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)
+
+char GCNDPPCombine::ID = 0;
+
+char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;
+
+FunctionPass *llvm::createGCNDPPCombinePass() {
+  return new GCNDPPCombine();
+}
+
+static int getDPPOp(unsigned Op) {
+  auto DPP32 = AMDGPU::getDPPOp32(Op);
+  if (DPP32 != -1)
+    return DPP32;
+
+  auto E32 = AMDGPU::getVOPe32(Op);
+  return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
+}
+
+// tracks the register operand definition and returns:
+//   1. immediate operand used to initialize the register if found
+//   2. nullptr if the register operand is undef
+//   3. the operand itself otherwise
+MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
+  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
+  if (!Def)
+    return nullptr;
+
+  switch(Def->getOpcode()) {
+  default: break;
+  case AMDGPU::IMPLICIT_DEF:
+    return nullptr;
+  case AMDGPU::COPY:
+  case AMDGPU::V_MOV_B32_e32: {
+    auto &Op1 = Def->getOperand(1);
+    if (Op1.isImm())
+      return &Op1;
+    break;
+  }
+  }
+  return &OldOpnd;
+}
+
+MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
+                                           MachineInstr &MovMI,
+                                           RegSubRegPair OldOpndVGPR,
+                                           bool BoundCtrlZero) const {
+  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
+         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
+
+  auto OrigOp = OrigMI.getOpcode();
+  auto DPPOp = getDPPOp(OrigOp);
+  if (DPPOp == -1) {
+    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
+    return nullptr;
+  }
+
+  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
+                         OrigMI.getDebugLoc(), TII->get(DPPOp));
+  bool Fail = false;
+  do {
+    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
+    assert(Dst);
+    DPPInst.add(*Dst);
+    int NumOperands = 1;
+
+    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
+    if (OldIdx != -1) {
+      assert(OldIdx == NumOperands);
+      assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+      DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
+      ++NumOperands;
+    }
+
+    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
+                                          AMDGPU::OpName::src0_modifiers)) {
+      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
+                                          AMDGPU::OpName::src0_modifiers));
+      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+      DPPInst.addImm(Mod0->getImm());
+      ++NumOperands;
+    }
+    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
+    assert(Src0);
+    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
+      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
+      Fail = true;
+      break;
+    }
+    DPPInst.add(*Src0);
+    ++NumOperands;
+
+    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
+                                          AMDGPU::OpName::src1_modifiers)) {
+      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
+                                          AMDGPU::OpName::src1_modifiers));
+      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+      DPPInst.addImm(Mod1->getImm());
+      ++NumOperands;
+    }
+    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
+        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
+        Fail = true;
+        break;
+      }
+      DPPInst.add(*Src1);
+      ++NumOperands;
+    }
+
+    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
+      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
+        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
+        Fail = true;
+        break;
+      }
+      DPPInst.add(*Src2);
+    }
+
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
+    DPPInst.addImm(BoundCtrlZero ? 1 : 0);
+  } while (false);
+
+  if (Fail) {
+    DPPInst.getInstr()->eraseFromParent();
+    return nullptr;
+  }
+  LLVM_DEBUG(dbgs() << "  combined:  " << *DPPInst.getInstr());
+  return DPPInst.getInstr();
+}
+
+GCNDPPCombine::RegSubRegPair
+GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
+                           RegSubRegPair OldOpndVGPR,
+                           MachineOperand &OldOpndValue) const {
+  assert(OldOpndValue.isImm());
+  switch (OrigMI.getOpcode()) {
+  default: break;
+  case AMDGPU::V_MAX_U32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
+      return OldOpndVGPR;
+    break;
+  case AMDGPU::V_MAX_I32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
+      return OldOpndVGPR;
+    break;
+  case AMDGPU::V_MIN_I32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
+      return OldOpndVGPR;
+    break;
+
+  case AMDGPU::V_MUL_I32_I24_e32:
+  case AMDGPU::V_MUL_U32_U24_e32:
+    if (OldOpndValue.getImm() == 1) {
+      auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+      assert(Src1 && Src1->isReg());
+      return getRegSubRegPair(*Src1);
+    }
+    break;
+  }
+  return RegSubRegPair();
+}
+
+// Cases to combine:
+//  $bound_ctrl is DPP_BOUND_ZERO, $old is any
+//  $bound_ctrl is DPP_BOUND_OFF, $old is 0
+//  -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
+
+//  $bound_ctrl is DPP_BOUND_OFF, $old is undef
+//  -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
+
+//  $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+//  -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
+
+MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
+                                           MachineInstr &MovMI,
+                                           RegSubRegPair OldOpndVGPR,
+                                           MachineOperand *OldOpndValue,
+                                           bool BoundCtrlZero) const {
+  assert(OldOpndVGPR.Reg);
+  if (!BoundCtrlZero && OldOpndValue) {
+    assert(OldOpndValue->isImm());
+    OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
+    if (!OldOpndVGPR.Reg) {
+      LLVM_DEBUG(dbgs() << "  failed: old immediate cannot be folded\n");
+      return nullptr;
+    }
+  }
+  return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
+}
+
+// returns true if MI doesn't have OpndName immediate operand or the
+// operand has Value
+bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
+                                    int64_t Value, int64_t Mask) const {
+  auto *Imm = TII->getNamedOperand(MI, OpndName);
+  if (!Imm)
+    return true;
+
+  assert(Imm->isImm());
+  return (Imm->getImm() & Mask) == Value;
+}
+
+bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
+  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
+  assert(BCZOpnd && BCZOpnd->isImm());
+  bool BoundCtrlZero = 0 != BCZOpnd->getImm();
+
+  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+
+  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
+  assert(OldOpnd && OldOpnd->isReg());
+  auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
+  auto *OldOpndValue = getOldOpndValue(*OldOpnd);
+  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
+  if (OldOpndValue) {
+    if (BoundCtrlZero) {
+      OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
+      OldOpndValue = nullptr;
+    } else {
+      if (!OldOpndValue->isImm()) {
+        LLVM_DEBUG(dbgs() << "  failed: old operand isn't an imm or undef\n");
+        return false;
+      }
+      if (OldOpndValue->getImm() == 0) {
+        OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
+        OldOpndValue = nullptr;
+        BoundCtrlZero = true;
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "  old=";
+    if (!OldOpndValue)
+      dbgs() << "undef";
+    else
+      dbgs() << OldOpndValue->getImm();
+    dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');
+
+  std::vector<MachineInstr*> OrigMIs, DPPMIs;
+  if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
+    OldOpndVGPR = RegSubRegPair(
+      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
+    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
+                             TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
+    DPPMIs.push_back(UndefInst.getInstr());
+  }
+
+  OrigMIs.push_back(&MovMI);
+  bool Rollback = true;
+  for (auto &Use : MRI->use_nodbg_operands(
+       TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
+    Rollback = true;
+
+    auto &OrigMI = *Use.getParent();
+    auto OrigOp = OrigMI.getOpcode();
+    if (TII->isVOP3(OrigOp)) {
+      if (!TII->hasVALU32BitEncoding(OrigOp)) {
+        LLVM_DEBUG(dbgs() << "  failed: VOP3 hasn't e32 equivalent\n");
+        break;
+      }
+      // check if other than abs|neg modifiers are set (opsel for example)
+      const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
+      if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
+          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
+          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
+          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
+        LLVM_DEBUG(dbgs() << "  failed: VOP3 has non-default modifiers\n");
+        break;
+      }
+    } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
+      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3\n");
+      break;
+    }
+
+    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
+    if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
+      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
+                                        OldOpndValue, BoundCtrlZero)) {
+        DPPMIs.push_back(DPPInst);
+        Rollback = false;
+      }
+    } else if (OrigMI.isCommutable() &&
+               &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+      auto *BB = OrigMI.getParent();
+      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
+      BB->insert(OrigMI, NewMI);
+      if (TII->commuteInstruction(*NewMI)) {
+        LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
+        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
+                                          OldOpndValue, BoundCtrlZero)) {
+          DPPMIs.push_back(DPPInst);
+          Rollback = false;
+        }
+      } else
+        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
+      NewMI->eraseFromParent();
+    } else
+      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
+    if (Rollback)
+      break;
+    OrigMIs.push_back(&OrigMI);
+  }
+
+  for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
+    MI->eraseFromParent();
+
+  return !Rollback;
+}
+
+bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
+  auto &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasDPP() || skipFunction(MF.getFunction()))
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TII = ST.getInstrInfo();
+
+  assert(MRI->isSSA() && "Must be run on SSA");
+
+  bool Changed = false;
+  for (auto &MBB : MF) {
+    for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
+      auto &MI = *I++;
+      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
+        Changed = true;
+        ++NumDPPMovsCombined;
+      }
+    }
+  }
+  return Changed;
+}
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index f236f10ba75ab..c6396de89c4f6 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -215,6 +215,14 @@ void GCNHazardRecognizer::AdvanceCycle() {
   if (!CurrCycleInstr)
     return;
 
+  // Do not track non-instructions which do not affect the wait states.
+  // If included, these instructions can lead to buffer overflow such that
+  // detectable hazards are missed.
+  if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF)
+    return;
+  else if (CurrCycleInstr->isDebugInstr())
+    return;
+
   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
 
   // Keep track of emitted instructions
@@ -253,8 +261,7 @@ int GCNHazardRecognizer::getWaitStatesSince(
         return WaitStates;
 
       unsigned Opcode = MI->getOpcode();
-      if (Opcode == AMDGPU::DBG_VALUE || Opcode == AMDGPU::IMPLICIT_DEF ||
-          Opcode == AMDGPU::INLINEASM)
+      if (Opcode == AMDGPU::INLINEASM)
         continue;
     }
     ++WaitStates;
diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp
index 651091d441364..d62dc8d86781c 100644
--- a/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -335,7 +335,7 @@ GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
     assert(C);
     AvailQueue.remove(*C);
     auto SU = C->SU;
-    LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+    LLVM_DEBUG(dbgs() << "Selected "; DAG.dumpNode(*SU));
 
     advanceToCycle(SU->getHeight());
 
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 15366d66bd852..8e4cc391dc21c 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -434,8 +434,7 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
 // Sort recorded regions by pressure - highest at the front
 void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
   const auto &ST = MF.getSubtarget<GCNSubtarget>();
-  llvm::sort(Regions.begin(), Regions.end(),
-    [&ST, TargetOcc](const Region *R1, const Region *R2) {
+  llvm::sort(Regions, [&ST, TargetOcc](const Region *R1, const Region *R2) {
     return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
   });
 }
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index 192d534bb9cfd..ec6bcae335551 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -258,7 +258,7 @@ GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
     assert(C);
     RQ.remove(*C);
     auto SU = C->SU;
-    LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+    LLVM_DEBUG(dbgs() << "Selected "; DAG.dumpNode(*SU));
 
     releaseSuccessors(SU, StepNo);
     Schedule.push_back(SU);
diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td
index d76acfa24f901..b8142a4e4ff88 100644
--- a/lib/Target/AMDGPU/GCNProcessors.td
+++ b/lib/Target/AMDGPU/GCNProcessors.td
@@ -156,3 +156,8 @@ def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
 def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
   [FeatureISAVersion9_0_6]
 >;
+
+def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
+  [FeatureISAVersion9_0_9]
+>;
+
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index db908368a1791..fab0f87dfcbea 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -207,9 +207,12 @@ void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo,
   printNamedBit(MI, OpNo, O, "da");
 }
 
-void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo,
                                   const MCSubtargetInfo &STI, raw_ostream &O) {
-  printNamedBit(MI, OpNo, O, "r128");
+  if (STI.hasFeature(AMDGPU::FeatureR128A16))
+    printNamedBit(MI, OpNo, O, "a16");
+  else
+    printNamedBit(MI, OpNo, O, "r128");
 }
 
 void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
@@ -236,21 +239,12 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
     O << " vm";
 }
 
-void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo,
-                                  const MCSubtargetInfo &STI,
-                                  raw_ostream &O) {
-  if (MI->getOperand(OpNo).getImm()) {
-    O << " dfmt:";
-    printU8ImmDecOperand(MI, OpNo, O);
-  }
-}
-
-void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo,
-                                  const MCSubtargetInfo &STI,
-                                  raw_ostream &O) {
-  if (MI->getOperand(OpNo).getImm()) {
-    O << " nfmt:";
-    printU8ImmDecOperand(MI, OpNo, O);
+void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  if (unsigned Val = MI->getOperand(OpNo).getImm()) {
+    O << " dfmt:" << (Val & 15);
+    O << ", nfmt:" << (Val >> 4);
   }
 }
 
@@ -1161,8 +1155,7 @@ void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo,
 void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
                                       const MCSubtargetInfo &STI,
                                       raw_ostream &O) {
-  AMDGPU::IsaInfo::IsaVersion ISA =
-      AMDGPU::IsaInfo::getIsaVersion(STI.getFeatureBits());
+  AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI.getCPU());
 
   unsigned SImm16 = MI->getOperand(OpNo).getImm();
   unsigned Vmcnt, Expcnt, Lgkmcnt;
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index 11a496a38b2cd..0ba74ca0f3e19 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -80,7 +80,7 @@ private:
                   raw_ostream &O);
   void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                raw_ostream &O);
-  void printR128(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+  void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                  raw_ostream &O);
   void printLWE(const MCInst *MI, unsigned OpNo,
                 const MCSubtargetInfo &STI, raw_ostream &O);
@@ -90,10 +90,8 @@ private:
                      const MCSubtargetInfo &STI, raw_ostream &O);
   void printExpVM(const MCInst *MI, unsigned OpNo,
                   const MCSubtargetInfo &STI, raw_ostream &O);
-  void printDFMT(const MCInst *MI, unsigned OpNo,
-                 const MCSubtargetInfo &STI, raw_ostream &O);
-  void printNFMT(const MCInst *MI, unsigned OpNo,
-                 const MCSubtargetInfo &STI, raw_ostream &O);
+  void printFORMAT(const MCInst *MI, unsigned OpNo,
+                   const MCSubtargetInfo &STI, raw_ostream &O);
 
   void printRegOperand(unsigned RegNo, raw_ostream &O);
   void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt
index c54a13c4b4d88..e591d756a545e 100644
--- a/lib/Target/AMDGPU/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/LLVMBuild.txt
@@ -30,5 +30,5 @@ has_disassembler = 1
 type = Library
 name = AMDGPUCodeGen
 parent = AMDGPU
-required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel
+required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel BinaryFormat
 add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 07bef9103c0d8..c85a1ea5b0549 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -46,11 +46,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
   if (const auto *SymA = Target.getSymA()) {
     // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
     // the scratch buffer.
-    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
+    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0" ||
+        SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
       return ELF::R_AMDGPU_ABS32_LO;
-
-    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
-      return ELF::R_AMDGPU_ABS32_HI;
   }
 
   switch (Target.getAccessVariant()) {
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 6a41e3f650bc2..c17fe126546ce 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -17,7 +17,9 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "Utils/AMDKernelCodeTUtils.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
 #include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Metadata.h"
@@ -27,6 +29,7 @@
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetParser.h"
 
 namespace llvm {
 #include "AMDGPUPTNote.h"
@@ -34,95 +37,116 @@ namespace llvm {
 
 using namespace llvm;
 using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::HSAMD;
 
 //===----------------------------------------------------------------------===//
 // AMDGPUTargetStreamer
 //===----------------------------------------------------------------------===//
 
-static const struct {
-  const char *Name;
-  unsigned Mach;
-} MachTable[] = {
-      // Radeon HD 2000/3000 Series (R600).
-      { "r600", ELF::EF_AMDGPU_MACH_R600_R600 },
-      { "r630", ELF::EF_AMDGPU_MACH_R600_R630 },
-      { "rs880", ELF::EF_AMDGPU_MACH_R600_RS880 },
-      { "rv670", ELF::EF_AMDGPU_MACH_R600_RV670 },
-      // Radeon HD 4000 Series (R700).
-      { "rv710", ELF::EF_AMDGPU_MACH_R600_RV710 },
-      { "rv730", ELF::EF_AMDGPU_MACH_R600_RV730 },
-      { "rv770", ELF::EF_AMDGPU_MACH_R600_RV770 },
-      // Radeon HD 5000 Series (Evergreen).
-      { "cedar", ELF::EF_AMDGPU_MACH_R600_CEDAR },
-      { "cypress", ELF::EF_AMDGPU_MACH_R600_CYPRESS },
-      { "juniper", ELF::EF_AMDGPU_MACH_R600_JUNIPER },
-      { "redwood", ELF::EF_AMDGPU_MACH_R600_REDWOOD },
-      { "sumo", ELF::EF_AMDGPU_MACH_R600_SUMO },
-      // Radeon HD 6000 Series (Northern Islands).
-      { "barts", ELF::EF_AMDGPU_MACH_R600_BARTS },
-      { "caicos", ELF::EF_AMDGPU_MACH_R600_CAICOS },
-      { "cayman", ELF::EF_AMDGPU_MACH_R600_CAYMAN },
-      { "turks", ELF::EF_AMDGPU_MACH_R600_TURKS },
-      // AMDGCN GFX6.
-      { "gfx600", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 },
-      { "tahiti", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 },
-      { "gfx601", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
-      { "hainan", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
-      { "oland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
-      { "pitcairn", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
-      { "verde", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
-      // AMDGCN GFX7.
-      { "gfx700", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 },
-      { "kaveri", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 },
-      { "gfx701", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 },
-      { "hawaii", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 },
-      { "gfx702", ELF::EF_AMDGPU_MACH_AMDGCN_GFX702 },
-      { "gfx703", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
-      { "kabini", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
-      { "mullins", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
-      { "gfx704", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 },
-      { "bonaire", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 },
-      // AMDGCN GFX8.
-      { "gfx801", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 },
-      { "carrizo", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 },
-      { "gfx802", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
-      { "iceland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
-      { "tonga", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
-      { "gfx803", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
-      { "fiji", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
-      { "polaris10", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
-      { "polaris11", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
-      { "gfx810", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 },
-      { "stoney", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 },
-      // AMDGCN GFX9.
-      { "gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900 },
-      { "gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902 },
-      { "gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904 },
-      { "gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906 },
-      // Not specified processor.
-      { nullptr, ELF::EF_AMDGPU_MACH_NONE }
-};
+bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
+  HSAMD::Metadata HSAMetadata;
+  if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
+    return false;
 
-unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const {
-  auto Entry = MachTable;
-  for (; Entry->Name && GPU != Entry->Name; ++Entry)
-    ;
-  return Entry->Mach;
+  return EmitHSAMetadata(HSAMetadata);
 }
 
-const char *AMDGPUTargetStreamer::getMachName(unsigned Mach) {
-  auto Entry = MachTable;
-  for (; Entry->Name && Mach != Entry->Mach; ++Entry)
-    ;
-  return Entry->Name;
+bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) {
+  std::shared_ptr<msgpack::Node> HSAMetadataRoot;
+  yaml::Input YIn(HSAMetadataString);
+  YIn >> HSAMetadataRoot;
+  if (YIn.error())
+    return false;
+  return EmitHSAMetadata(HSAMetadataRoot, false);
 }
 
-bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) {
-  HSAMD::Metadata HSAMetadata;
-  if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
-    return false;
+StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
+  AMDGPU::GPUKind AK;
 
-  return EmitHSAMetadata(HSAMetadata);
+  switch (ElfMach) {
+  case ELF::EF_AMDGPU_MACH_R600_R600:     AK = GK_R600;    break;
+  case ELF::EF_AMDGPU_MACH_R600_R630:     AK = GK_R630;    break;
+  case ELF::EF_AMDGPU_MACH_R600_RS880:    AK = GK_RS880;   break;
+  case ELF::EF_AMDGPU_MACH_R600_RV670:    AK = GK_RV670;   break;
+  case ELF::EF_AMDGPU_MACH_R600_RV710:    AK = GK_RV710;   break;
+  case ELF::EF_AMDGPU_MACH_R600_RV730:    AK = GK_RV730;   break;
+  case ELF::EF_AMDGPU_MACH_R600_RV770:    AK = GK_RV770;   break;
+  case ELF::EF_AMDGPU_MACH_R600_CEDAR:    AK = GK_CEDAR;   break;
+  case ELF::EF_AMDGPU_MACH_R600_CYPRESS:  AK = GK_CYPRESS; break;
+  case ELF::EF_AMDGPU_MACH_R600_JUNIPER:  AK = GK_JUNIPER; break;
+  case ELF::EF_AMDGPU_MACH_R600_REDWOOD:  AK = GK_REDWOOD; break;
+  case ELF::EF_AMDGPU_MACH_R600_SUMO:     AK = GK_SUMO;    break;
+  case ELF::EF_AMDGPU_MACH_R600_BARTS:    AK = GK_BARTS;   break;
+  case ELF::EF_AMDGPU_MACH_R600_CAICOS:   AK = GK_CAICOS;  break;
+  case ELF::EF_AMDGPU_MACH_R600_CAYMAN:   AK = GK_CAYMAN;  break;
+  case ELF::EF_AMDGPU_MACH_R600_TURKS:    AK = GK_TURKS;   break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909;  break;
+  case ELF::EF_AMDGPU_MACH_NONE:          AK = GK_NONE;    break;
+  }
+
+  StringRef GPUName = getArchNameAMDGCN(AK);
+  if (GPUName != "")
+    return GPUName;
+  return getArchNameR600(AK);
+}
+
+unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
+  AMDGPU::GPUKind AK = parseArchAMDGCN(GPU);
+  if (AK == AMDGPU::GPUKind::GK_NONE)
+    AK = parseArchR600(GPU);
+
+  switch (AK) {
+  case GK_R600:    return ELF::EF_AMDGPU_MACH_R600_R600;
+  case GK_R630:    return ELF::EF_AMDGPU_MACH_R600_R630;
+  case GK_RS880:   return ELF::EF_AMDGPU_MACH_R600_RS880;
+  case GK_RV670:   return ELF::EF_AMDGPU_MACH_R600_RV670;
+  case GK_RV710:   return ELF::EF_AMDGPU_MACH_R600_RV710;
+  case GK_RV730:   return ELF::EF_AMDGPU_MACH_R600_RV730;
+  case GK_RV770:   return ELF::EF_AMDGPU_MACH_R600_RV770;
+  case GK_CEDAR:   return ELF::EF_AMDGPU_MACH_R600_CEDAR;
+  case GK_CYPRESS: return ELF::EF_AMDGPU_MACH_R600_CYPRESS;
+  case GK_JUNIPER: return ELF::EF_AMDGPU_MACH_R600_JUNIPER;
+  case GK_REDWOOD: return ELF::EF_AMDGPU_MACH_R600_REDWOOD;
+  case GK_SUMO:    return ELF::EF_AMDGPU_MACH_R600_SUMO;
+  case GK_BARTS:   return ELF::EF_AMDGPU_MACH_R600_BARTS;
+  case GK_CAICOS:  return ELF::EF_AMDGPU_MACH_R600_CAICOS;
+  case GK_CAYMAN:  return ELF::EF_AMDGPU_MACH_R600_CAYMAN;
+  case GK_TURKS:   return ELF::EF_AMDGPU_MACH_R600_TURKS;
+  case GK_GFX600:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX600;
+  case GK_GFX601:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX601;
+  case GK_GFX700:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX700;
+  case GK_GFX701:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX701;
+  case GK_GFX702:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX702;
+  case GK_GFX703:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX703;
+  case GK_GFX704:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX704;
+  case GK_GFX801:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX801;
+  case GK_GFX802:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX802;
+  case GK_GFX803:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX803;
+  case GK_GFX810:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX810;
+  case GK_GFX900:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX900;
+  case GK_GFX902:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX902;
+  case GK_GFX904:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904;
+  case GK_GFX906:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
+  case GK_GFX909:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
+  case GK_NONE:    return ELF::EF_AMDGPU_MACH_NONE;
+  }
+
+  llvm_unreachable("unknown GPU");
 }
 
 //===----------------------------------------------------------------------===//
@@ -183,9 +207,26 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
   if (HSAMD::toString(HSAMetadata, HSAMetadataString))
     return false;
 
-  OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n';
+  OS << '\t' << AssemblerDirectiveBegin << '\n';
   OS << HSAMetadataString << '\n';
-  OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n';
+  OS << '\t' << AssemblerDirectiveEnd << '\n';
+  return true;
+}
+
+bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
+    std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+  V3::MetadataVerifier Verifier(Strict);
+  if (!Verifier.verify(*HSAMetadataRoot))
+    return false;
+
+  std::string HSAMetadataString;
+  raw_string_ostream StrOS(HSAMetadataString);
+  yaml::Output YOut(StrOS);
+  YOut << HSAMetadataRoot;
+
+  OS << '\t' << V3::AssemblerDirectiveBegin << '\n';
+  OS << StrOS.str() << '\n';
+  OS << '\t' << V3::AssemblerDirectiveEnd << '\n';
   return true;
 }
 
@@ -203,70 +244,59 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
     const MCSubtargetInfo &STI, StringRef KernelName,
     const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
     bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) {
-  amdhsa::kernel_descriptor_t DefaultKD = getDefaultAmdhsaKernelDescriptor();
-
-  IsaInfo::IsaVersion IVersion = IsaInfo::getIsaVersion(STI.getFeatureBits());
+  IsaVersion IVersion = getIsaVersion(STI.getCPU());
 
   OS << "\t.amdhsa_kernel " << KernelName << '\n';
 
-#define PRINT_IF_NOT_DEFAULT(STREAM, DIRECTIVE, KERNEL_DESC,                   \
-                             DEFAULT_KERNEL_DESC, MEMBER_NAME, FIELD_NAME)     \
-  if (AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) !=                  \
-      AMDHSA_BITS_GET(DEFAULT_KERNEL_DESC.MEMBER_NAME, FIELD_NAME))            \
-    STREAM << "\t\t" << DIRECTIVE << " "                                       \
-           << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n';
+#define PRINT_FIELD(STREAM, DIRECTIVE, KERNEL_DESC, MEMBER_NAME, FIELD_NAME)   \
+  STREAM << "\t\t" << DIRECTIVE << " "                                         \
+         << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n';
 
-  if (KD.group_segment_fixed_size != DefaultKD.group_segment_fixed_size)
-    OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size
-       << '\n';
-  if (KD.private_segment_fixed_size != DefaultKD.private_segment_fixed_size)
-    OS << "\t\t.amdhsa_private_segment_fixed_size "
-       << KD.private_segment_fixed_size << '\n';
+  OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size
+     << '\n';
+  OS << "\t\t.amdhsa_private_segment_fixed_size "
+     << KD.private_segment_fixed_size << '\n';
 
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, DefaultKD,
-      kernel_code_properties,
-      amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, DefaultKD,
-                       kernel_code_properties,
-                       amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_queue_ptr", KD, DefaultKD,
-                       kernel_code_properties,
-                       amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD, DefaultKD,
-      kernel_code_properties,
-      amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_id", KD, DefaultKD,
-                       kernel_code_properties,
-                       amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, DefaultKD,
-      kernel_code_properties,
-      amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_user_sgpr_private_segment_size", KD, DefaultKD,
-      kernel_code_properties,
-      amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, DefaultKD,
+  PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
+              kernel_code_properties,
+              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+  PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD,
+              kernel_code_properties,
+              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
+  PRINT_FIELD(OS, ".amdhsa_user_sgpr_queue_ptr", KD,
+              kernel_code_properties,
+              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
+  PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD,
+              kernel_code_properties,
+              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
+  PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD,
+              kernel_code_properties,
+              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
+  PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
+              kernel_code_properties,
+              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+  PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD,
+              kernel_code_properties,
+              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+  PRINT_FIELD(
+      OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD,
       compute_pgm_rsrc2,
       amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, DefaultKD,
-                       compute_pgm_rsrc2,
-                       amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD, DefaultKD,
-                       compute_pgm_rsrc2,
-                       amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD, DefaultKD,
-                       compute_pgm_rsrc2,
-                       amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_info", KD, DefaultKD,
-                       compute_pgm_rsrc2,
-                       amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_vgpr_workitem_id", KD, DefaultKD,
-                       compute_pgm_rsrc2,
-                       amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
+  PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD,
+              compute_pgm_rsrc2,
+              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
+  PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD,
+              compute_pgm_rsrc2,
+              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
+  PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD,
+              compute_pgm_rsrc2,
+              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
+  PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_info", KD,
+              compute_pgm_rsrc2,
+              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
+  PRINT_FIELD(OS, ".amdhsa_system_vgpr_workitem_id", KD,
+              compute_pgm_rsrc2,
+              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
 
   // These directives are required.
   OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
@@ -279,54 +309,52 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
   if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI))
     OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n';
 
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_32", KD, DefaultKD,
-                       compute_pgm_rsrc1,
-                       amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_16_64", KD, DefaultKD,
-                       compute_pgm_rsrc1,
-                       amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_32", KD, DefaultKD,
-                       compute_pgm_rsrc1,
-                       amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_16_64", KD, DefaultKD,
-                       compute_pgm_rsrc1,
-                       amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_dx10_clamp", KD, DefaultKD,
-                       compute_pgm_rsrc1,
-                       amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
-  PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_ieee_mode", KD, DefaultKD,
-                       compute_pgm_rsrc1,
-                       amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
+  PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD,
+              compute_pgm_rsrc1,
+              amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
+  PRINT_FIELD(OS, ".amdhsa_float_round_mode_16_64", KD,
+              compute_pgm_rsrc1,
+              amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
+  PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_32", KD,
+              compute_pgm_rsrc1,
+              amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
+  PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_16_64", KD,
+              compute_pgm_rsrc1,
+              amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
+  PRINT_FIELD(OS, ".amdhsa_dx10_clamp", KD,
+              compute_pgm_rsrc1,
+              amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
+  PRINT_FIELD(OS, ".amdhsa_ieee_mode", KD,
+              compute_pgm_rsrc1,
+              amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
   if (IVersion.Major >= 9)
-    PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_fp16_overflow", KD, DefaultKD,
-                         compute_pgm_rsrc1,
-                         amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, DefaultKD,
+    PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,
+                compute_pgm_rsrc1,
+                amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+  PRINT_FIELD(
+      OS, ".amdhsa_exception_fp_ieee_invalid_op", KD,
       compute_pgm_rsrc2,
       amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_exception_fp_denorm_src", KD, DefaultKD, compute_pgm_rsrc2,
-      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_exception_fp_ieee_div_zero", KD, DefaultKD,
+  PRINT_FIELD(OS, ".amdhsa_exception_fp_denorm_src", KD,
+              compute_pgm_rsrc2,
+              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
+  PRINT_FIELD(
+      OS, ".amdhsa_exception_fp_ieee_div_zero", KD,
       compute_pgm_rsrc2,
       amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_exception_fp_ieee_overflow", KD, DefaultKD,
-      compute_pgm_rsrc2,
-      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_exception_fp_ieee_underflow", KD, DefaultKD,
-      compute_pgm_rsrc2,
-      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_exception_fp_ieee_inexact", KD, DefaultKD, compute_pgm_rsrc2,
-      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
-  PRINT_IF_NOT_DEFAULT(
-      OS, ".amdhsa_exception_int_div_zero", KD, DefaultKD, compute_pgm_rsrc2,
-      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
-#undef PRINT_IF_NOT_DEFAULT
+  PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_overflow", KD,
+              compute_pgm_rsrc2,
+              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
+  PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_underflow", KD,
+              compute_pgm_rsrc2,
+              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
+  PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_inexact", KD,
+              compute_pgm_rsrc2,
+              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
+  PRINT_FIELD(OS, ".amdhsa_exception_int_div_zero", KD,
+              compute_pgm_rsrc2,
+              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
+#undef PRINT_FIELD
 
   OS << "\t.end_amdhsa_kernel\n";
 }
@@ -342,12 +370,16 @@ AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(
   unsigned EFlags = MCA.getELFHeaderEFlags();
 
   EFlags &= ~ELF::EF_AMDGPU_MACH;
-  EFlags |= getMACH(STI.getCPU());
+  EFlags |= getElfMach(STI.getCPU());
 
   EFlags &= ~ELF::EF_AMDGPU_XNACK;
   if (AMDGPU::hasXNACK(STI))
     EFlags |= ELF::EF_AMDGPU_XNACK;
 
+  EFlags &= ~ELF::EF_AMDGPU_SRAM_ECC;
+  if (AMDGPU::hasSRAMECC(STI))
+    EFlags |= ELF::EF_AMDGPU_SRAM_ECC;
+
   MCA.setELFHeaderEFlags(EFlags);
 }
 
@@ -355,13 +387,13 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
   return static_cast<MCELFStreamer &>(Streamer);
 }
 
-void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
-    const MCExpr *DescSZ, unsigned NoteType,
+void AMDGPUTargetELFStreamer::EmitNote(
+    StringRef Name, const MCExpr *DescSZ, unsigned NoteType,
     function_ref<void(MCELFStreamer &)> EmitDesc) {
   auto &S = getStreamer();
   auto &Context = S.getContext();
 
-  auto NameSZ = sizeof(ElfNote::NoteName);
+  auto NameSZ = Name.size() + 1;
 
   S.PushSection();
   S.SwitchSection(Context.getELFSection(
@@ -369,7 +401,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
   S.EmitIntValue(NameSZ, 4);                                  // namesz
   S.EmitValue(DescSZ, 4);                                     // descz
   S.EmitIntValue(NoteType, 4);                                // type
-  S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ));          // name
+  S.EmitBytes(Name);                                          // name
   S.EmitValueToAlignment(4, 0, 1, 0);                         // padding 0
   EmitDesc(S);                                                // desc
   S.EmitValueToAlignment(4, 0, 1, 0);                         // padding 0
@@ -381,14 +413,11 @@ void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {}
 void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
     uint32_t Major, uint32_t Minor) {
 
-  EmitAMDGPUNote(
-    MCConstantExpr::create(8, getContext()),
-    ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
-    [&](MCELFStreamer &OS){
-      OS.EmitIntValue(Major, 4);
-      OS.EmitIntValue(Minor, 4);
-    }
-  );
+  EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()),
+           ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
+             OS.EmitIntValue(Major, 4);
+             OS.EmitIntValue(Minor, 4);
+           });
 }
 
 void
@@ -404,21 +433,18 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
     sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
     VendorNameSize + ArchNameSize;
 
-  EmitAMDGPUNote(
-    MCConstantExpr::create(DescSZ, getContext()),
-    ElfNote::NT_AMDGPU_HSA_ISA,
-    [&](MCELFStreamer &OS) {
-      OS.EmitIntValue(VendorNameSize, 2);
-      OS.EmitIntValue(ArchNameSize, 2);
-      OS.EmitIntValue(Major, 4);
-      OS.EmitIntValue(Minor, 4);
-      OS.EmitIntValue(Stepping, 4);
-      OS.EmitBytes(VendorName);
-      OS.EmitIntValue(0, 1); // NULL terminate VendorName
-      OS.EmitBytes(ArchName);
-      OS.EmitIntValue(0, 1); // NULL terminte ArchName
-    }
-  );
+  EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()),
+           ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) {
+             OS.EmitIntValue(VendorNameSize, 2);
+             OS.EmitIntValue(ArchNameSize, 2);
+             OS.EmitIntValue(Major, 4);
+             OS.EmitIntValue(Minor, 4);
+             OS.EmitIntValue(Stepping, 4);
+             OS.EmitBytes(VendorName);
+             OS.EmitIntValue(0, 1); // NULL terminate VendorName
+             OS.EmitBytes(ArchName);
+             OS.EmitIntValue(0, 1); // NULL terminte ArchName
+           });
 }
 
 void
@@ -447,15 +473,41 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
     MCSymbolRefExpr::create(DescEnd, Context),
     MCSymbolRefExpr::create(DescBegin, Context), Context);
 
-  EmitAMDGPUNote(
-    DescSZ,
-    ELF::NT_AMD_AMDGPU_ISA,
-    [&](MCELFStreamer &OS) {
-      OS.EmitLabel(DescBegin);
-      OS.EmitBytes(IsaVersionString);
-      OS.EmitLabel(DescEnd);
-    }
-  );
+  EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA,
+           [&](MCELFStreamer &OS) {
+             OS.EmitLabel(DescBegin);
+             OS.EmitBytes(IsaVersionString);
+             OS.EmitLabel(DescEnd);
+           });
+  return true;
+}
+
+bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
+    std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+  V3::MetadataVerifier Verifier(Strict);
+  if (!Verifier.verify(*HSAMetadataRoot))
+    return false;
+
+  std::string HSAMetadataString;
+  raw_string_ostream StrOS(HSAMetadataString);
+  msgpack::Writer MPWriter(StrOS);
+  HSAMetadataRoot->write(MPWriter);
+
+  // Create two labels to mark the beginning and end of the desc field
+  // and a MCExpr to calculate the size of the desc field.
+  auto &Context = getContext();
+  auto *DescBegin = Context.createTempSymbol();
+  auto *DescEnd = Context.createTempSymbol();
+  auto *DescSZ = MCBinaryExpr::createSub(
+      MCSymbolRefExpr::create(DescEnd, Context),
+      MCSymbolRefExpr::create(DescBegin, Context), Context);
+
+  EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA,
+           [&](MCELFStreamer &OS) {
+             OS.EmitLabel(DescBegin);
+             OS.EmitBytes(StrOS.str());
+             OS.EmitLabel(DescEnd);
+           });
   return true;
 }
 
@@ -474,28 +526,24 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
     MCSymbolRefExpr::create(DescEnd, Context),
     MCSymbolRefExpr::create(DescBegin, Context), Context);
 
-  EmitAMDGPUNote(
-    DescSZ,
-    ELF::NT_AMD_AMDGPU_HSA_METADATA,
-    [&](MCELFStreamer &OS) {
-      OS.EmitLabel(DescBegin);
-      OS.EmitBytes(HSAMetadataString);
-      OS.EmitLabel(DescEnd);
-    }
-  );
+  EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA,
+           [&](MCELFStreamer &OS) {
+             OS.EmitLabel(DescBegin);
+             OS.EmitBytes(HSAMetadataString);
+             OS.EmitLabel(DescEnd);
+           });
   return true;
 }
 
 bool AMDGPUTargetELFStreamer::EmitPALMetadata(
     const PALMD::Metadata &PALMetadata) {
-  EmitAMDGPUNote(
-    MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), getContext()),
-    ELF::NT_AMD_AMDGPU_PAL_METADATA,
-    [&](MCELFStreamer &OS){
-      for (auto I : PALMetadata)
-        OS.EmitIntValue(I, sizeof(uint32_t));
-    }
-  );
+  EmitNote(ElfNote::NoteNameV2,
+           MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t),
+                                  getContext()),
+           ELF::NT_AMD_AMDGPU_PAL_METADATA, [&](MCELFStreamer &OS) {
+             for (auto I : PALMetadata)
+               OS.EmitIntValue(I, sizeof(uint32_t));
+           });
   return true;
 }
 
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 472da1b735936..9a807c804f9ff 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -11,6 +11,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
 
 #include "AMDKernelCodeT.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/AMDGPUMetadata.h"
@@ -31,13 +32,7 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
 protected:
   MCContext &getContext() const { return Streamer.getContext(); }
 
-  /// \returns Equivalent EF_AMDGPU_MACH_* value for given \p GPU name.
-  unsigned getMACH(StringRef GPU) const;
-
 public:
-  /// \returns Equivalent GPU name for an EF_AMDGPU_MACH_* value.
-  static const char *getMachName(unsigned Mach);
-
   AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
 
   virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0;
@@ -58,7 +53,20 @@ public:
   virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
 
   /// \returns True on success, false on failure.
-  virtual bool EmitHSAMetadata(StringRef HSAMetadataString);
+  virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString);
+
+  /// \returns True on success, false on failure.
+  virtual bool EmitHSAMetadataV3(StringRef HSAMetadataString);
+
+  /// Emit HSA Metadata
+  ///
+  /// When \p Strict is true, known metadata elements must already be
+  /// well-typed. When \p Strict is false, known types are inferred and
+  /// the \p HSAMetadata structure is updated with the correct types.
+  ///
+  /// \returns True on success, false on failure.
+  virtual bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
+                               bool Strict) = 0;
 
   /// \returns True on success, false on failure.
   virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0;
@@ -71,6 +79,9 @@ public:
       const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
       uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
       bool ReserveXNACK) = 0;
+
+  static StringRef getArchNameFromElfMach(unsigned ElfMach);
+  static unsigned getElfMach(StringRef GPU);
 };
 
 class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
@@ -95,6 +106,10 @@ public:
   bool EmitISAVersion(StringRef IsaVersionString) override;
 
   /// \returns True on success, false on failure.
+  bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
+                       bool Strict) override;
+
+  /// \returns True on success, false on failure.
   bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
 
   /// \returns True on success, false on failure.
@@ -110,8 +125,8 @@ public:
 class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
   MCStreamer &Streamer;
 
-  void EmitAMDGPUNote(const MCExpr *DescSize, unsigned NoteType,
-                      function_ref<void(MCELFStreamer &)> EmitDesc);
+  void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType,
+                function_ref<void(MCELFStreamer &)> EmitDesc);
 
 public:
   AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
@@ -135,6 +150,10 @@ public:
   bool EmitISAVersion(StringRef IsaVersionString) override;
 
   /// \returns True on success, false on failure.
+  bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
+                       bool Strict) override;
+
+  /// \returns True on success, false on failure.
   bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
 
   /// \returns True on success, false on failure.
diff --git a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
index 773ee7c0a4ba7..bc910a470d72a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = AMDGPUDesc
 parent = AMDGPU
-required_libraries = Core MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support
+required_libraries = Core MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support BinaryFormat
 add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 44c2d366e4613..1c68dbd78e758 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -29,6 +29,7 @@ class MIMGBaseOpcode {
   bit Atomic = 0;
   bit AtomicX2 = 0; // (f)cmpswap
   bit Sampler = 0;
+  bit Gather4 = 0;
   bits<8> NumExtraArgs = 0;
   bit Gradients = 0;
   bit Coordinates = 1;
@@ -43,7 +44,7 @@ def MIMGBaseOpcode : GenericEnum {
 def MIMGBaseOpcodesTable : GenericTable {
   let FilterClass = "MIMGBaseOpcode";
   let CppTypeName = "MIMGBaseOpcodeInfo";
-  let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
+  let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4",
                 "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
                 "HasD16"];
   GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
@@ -141,7 +142,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm,
 
   let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
                                 DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
-                                R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+                                R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
   let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
                       #!if(BaseOpcode.HasD16, "$d16", "");
@@ -179,6 +180,8 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
     defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+    let VDataDwords = 8 in
+    defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
   }
 }
 
@@ -199,7 +202,7 @@ class MIMG_Store_Helper <bits<7> op, string asm,
 
   let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
                                 DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
-                                R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+                                R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
   let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
                       #!if(BaseOpcode.HasD16, "$d16", "");
@@ -252,7 +255,7 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
 
   let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
                            DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
-                           R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
+                           R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
   let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
 }
 
@@ -316,7 +319,7 @@ class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc,
 
   let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
                                 DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
-                                R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+                                R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
   let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"
                       #!if(BaseOpcode.HasD16, "$d16", "");
@@ -411,6 +414,8 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
     defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+    let VDataDwords = 8 in
+    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
   }
 }
 
@@ -421,6 +426,7 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
                         string asm = "image_gather4"#sample.LowerCaseMod> {
   def "" : MIMG_Sampler_BaseOpcode<sample> {
     let HasD16 = 1;
+    let Gather4 = 1;
   }
 
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@@ -429,6 +435,8 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
     defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
+    let VDataDwords = 8 in
+    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
   }
 }
 
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 1683fe6c9a571..679cf18d2c20b 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -226,11 +226,11 @@ private:
         // occur in the same basic block as its definition, because
         // it is illegal for the scheduler to schedule them in
         // different blocks.
-        if (UseI->readsRegister(MOI->getReg()))
+        if (UseI->readsRegister(MOI->getReg(), &TRI))
           LastUseCount = AluInstCount;
 
         // Exit early if the current use kills the register
-        if (UseI != Def && UseI->killsRegister(MOI->getReg()))
+        if (UseI != Def && UseI->killsRegister(MOI->getReg(), &TRI))
           break;
       }
       if (LastUseCount)
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 113d6249fa60a..e2a0f05d2b34d 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -589,7 +589,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
     }
 
     case Intrinsic::r600_implicitarg_ptr: {
-      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
+      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
       uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT);
       return DAG.getConstant(ByteOffset, DL, PtrVT);
     }
@@ -741,12 +741,12 @@ SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                SDValue Op,
                                                SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
-  if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS)
+  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 
   const DataLayout &DL = DAG.getDataLayout();
   const GlobalValue *GV = GSD->getGlobal();
-  MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
+  MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
 
   SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
@@ -903,7 +903,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                    unsigned DwordOffset) const {
   unsigned ByteOffset = DwordOffset * 4;
   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                      AMDGPUASI.CONSTANT_BUFFER_0);
+                                      AMDGPUAS::PARAM_I_ADDRESS);
 
   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
   assert(isInt<16>(ByteOffset));
@@ -1141,7 +1141,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
   //TODO: Who creates the i8 stores?
   assert(Store->isTruncatingStore()
          || Store->getValue().getValueType() == MVT::i8);
-  assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS);
+  assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
 
   SDValue Mask;
   if (Store->getMemoryVT() == MVT::i8) {
@@ -1175,7 +1175,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
   // Load dword
   // TODO: can we be smarter about machine pointer info?
   MachinePointerInfo PtrInfo(UndefValue::get(
-      Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
+      Type::getInt32PtrTy(*DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS)));
   SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
 
   Chain = Dst.getValue(1);
@@ -1241,9 +1241,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
 
   // Neither LOCAL nor PRIVATE can do vectors at the moment
-  if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) &&
+  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
       VT.isVector()) {
-    if ((AS == AMDGPUASI.PRIVATE_ADDRESS) &&
+    if ((AS == AMDGPUAS::PRIVATE_ADDRESS) &&
          StoreNode->isTruncatingStore()) {
       // Add an extra level of chain to isolate this vector
       SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
@@ -1267,7 +1267,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
                                   DAG.getConstant(2, DL, PtrVT));
 
-  if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
+  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
     // It is beneficial to create MSKOR here instead of combiner to avoid
     // artificial dependencies introduced by RMW
     if (StoreNode->isTruncatingStore()) {
@@ -1320,7 +1320,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
-  if (AS != AMDGPUASI.PRIVATE_ADDRESS)
+  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
     return SDValue();
 
   if (MemVT.bitsLT(MVT::i32))
@@ -1403,7 +1403,7 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
   // Load dword
   // TODO: can we be smarter about machine pointer info?
   MachinePointerInfo PtrInfo(UndefValue::get(
-      Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
+      Type::getInt32PtrTy(*DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS)));
   SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
 
   // Get offset within the register.
@@ -1441,7 +1441,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   EVT MemVT = LoadNode->getMemoryVT();
   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
 
-  if (AS == AMDGPUASI.PRIVATE_ADDRESS &&
+  if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
     return lowerPrivateExtLoad(Op, DAG);
   }
@@ -1451,45 +1451,29 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = LoadNode->getChain();
   SDValue Ptr = LoadNode->getBasePtr();
 
-  if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
-      LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) &&
+  if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+      LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
       VT.isVector()) {
       return scalarizeVectorLoad(LoadNode, DAG);
   }
 
+  // This is still used for explicit load from addrspace(8)
   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
   if (ConstantBlock > -1 &&
       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
     SDValue Result;
-    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
-        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
+    if (isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
         isa<ConstantSDNode>(Ptr)) {
-      SDValue Slots[4];
-      for (unsigned i = 0; i < 4; i++) {
-        // We want Const position encoded with the following formula :
-        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
-        // const_index is Ptr computed by llvm using an alignment of 16.
-        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
-        // then div by 4 at the ISel step
-        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
-            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
-        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
-      }
-      EVT NewVT = MVT::v4i32;
-      unsigned NumElements = 4;
-      if (VT.isVector()) {
-        NewVT = VT;
-        NumElements = VT.getVectorNumElements();
-      }
-      Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
+      return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG);
     } else {
+      //TODO: Does this even work?
       // non-constant ptr can't be folded, keeps it as a v4f32 load
       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                       DAG.getConstant(4, DL, MVT::i32)),
                       DAG.getConstant(LoadNode->getAddressSpace() -
-                                      AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32)
+                                      AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
           );
     }
 
@@ -1525,7 +1509,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getMergeValues(MergedValues, DL);
   }
 
-  if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) {
+  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
     return SDValue();
   }
 
@@ -1622,7 +1606,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
     }
 
     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                          AMDGPUASI.CONSTANT_BUFFER_0);
+                                          AMDGPUAS::PARAM_I_ADDRESS);
 
     // i64 isn't a legal type, so the register type used ends up as i32, which
     // isn't expected here. It attempts to create this sextload, but it ends up
@@ -1646,17 +1630,17 @@ SDValue R600TargetLowering::LowerFormalArguments(
 
     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
     unsigned PartOffset = VA.getLocMemOffset();
+    unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset);
 
     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
     SDValue Arg = DAG.getLoad(
         ISD::UNINDEXED, Ext, VT, DL, Chain,
         DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
         PtrInfo,
-        MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
+        MemVT, Alignment, MachineMemOperand::MONonTemporal |
                                         MachineMemOperand::MODereferenceable |
                                         MachineMemOperand::MOInvariant);
 
-    // 4 is the preferred alignment for the CONSTANT memory space.
     InVals.push_back(Arg);
   }
   return Chain;
@@ -1672,7 +1656,7 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
 bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                           const SelectionDAG &DAG) const {
   // Local and Private addresses do not handle vectors. Limit to i32
-  if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) {
+  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS)) {
     return (MemVT.getSizeInBits() <= 32);
   }
   return true;
@@ -1701,14 +1685,15 @@ bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
 static SDValue CompactSwizzlableVector(
   SelectionDAG &DAG, SDValue VectorEntry,
   DenseMap<unsigned, unsigned> &RemapSwizzle) {
-  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   assert(RemapSwizzle.empty());
-  SDValue NewBldVec[4] = {
-    VectorEntry.getOperand(0),
-    VectorEntry.getOperand(1),
-    VectorEntry.getOperand(2),
-    VectorEntry.getOperand(3)
-  };
+
+  SDLoc DL(VectorEntry);
+  EVT EltTy = VectorEntry.getValueType().getVectorElementType();
+
+  SDValue NewBldVec[4];
+  for (unsigned i = 0; i < 4; i++)
+    NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
+                               DAG.getIntPtrConstant(i, DL));
 
   for (unsigned i = 0; i < 4; i++) {
     if (NewBldVec[i].isUndef())
@@ -1743,15 +1728,17 @@ static SDValue CompactSwizzlableVector(
 
 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
-  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   assert(RemapSwizzle.empty());
-  SDValue NewBldVec[4] = {
-      VectorEntry.getOperand(0),
-      VectorEntry.getOperand(1),
-      VectorEntry.getOperand(2),
-      VectorEntry.getOperand(3)
-  };
-  bool isUnmovable[4] = { false, false, false, false };
+
+  SDLoc DL(VectorEntry);
+  EVT EltTy = VectorEntry.getValueType().getVectorElementType();
+
+  SDValue NewBldVec[4];
+  bool isUnmovable[4] = {false, false, false, false};
+  for (unsigned i = 0; i < 4; i++)
+    NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
+                               DAG.getIntPtrConstant(i, DL));
+
   for (unsigned i = 0; i < 4; i++) {
     RemapSwizzle[i] = i;
     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
@@ -1782,7 +1769,6 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
                                             SelectionDAG &DAG,
                                             const SDLoc &DL) const {
-  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
   // Old -> New swizzle values
   DenseMap<unsigned, unsigned> SwizzleRemap;
 
@@ -1804,6 +1790,52 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
   return BuildVector;
 }
 
+SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(LoadNode);
+  EVT VT = LoadNode->getValueType(0);
+  SDValue Chain = LoadNode->getChain();
+  SDValue Ptr = LoadNode->getBasePtr();
+  assert (isa<ConstantSDNode>(Ptr));
+
+  //TODO: Support smaller loads
+  if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode))
+    return SDValue();
+
+  if (LoadNode->getAlignment() < 4)
+    return SDValue();
+
+  int ConstantBlock = ConstantAddressBlock(Block);
+
+  SDValue Slots[4];
+  for (unsigned i = 0; i < 4; i++) {
+    // We want Const position encoded with the following formula :
+    // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+    // const_index is Ptr computed by llvm using an alignment of 16.
+    // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
+    // then div by 4 at the ISel step
+    SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+        DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
+    Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+  }
+  EVT NewVT = MVT::v4i32;
+  unsigned NumElements = 4;
+  if (VT.isVector()) {
+    NewVT = VT;
+    NumElements = VT.getVectorNumElements();
+  }
+  SDValue Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
+  if (!VT.isVector()) {
+    Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+                         DAG.getConstant(0, DL, MVT::i32));
+  }
+  SDValue MergedValues[2] = {
+    Result,
+    Chain
+  };
+  return DAG.getMergeValues(MergedValues, DL);
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG Optimizations
 //===----------------------------------------------------------------------===//
@@ -2022,6 +2054,16 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
   }
+
+  case ISD::LOAD: {
+    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
+    SDValue Ptr = LoadNode->getBasePtr();
+    if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS &&
+         isa<ConstantSDNode>(Ptr))
+      return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG);
+    break;
+  }
+
   default: break;
   }
 
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 907d1f10e1519..767c3c7bd5bfe 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -98,9 +98,11 @@ private:
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
 
- bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
-                  SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
-                  SelectionDAG &DAG) const;
+  bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
+                   SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
+                   SelectionDAG &DAG) const;
+  SDValue constBufferLoad(LoadSDNode *LoadNode, int Block,
+                          SelectionDAG &DAG) const;
 
   SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
 };
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 5397e779474c8..9cc3e5f3c314c 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -229,11 +229,11 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
 }
 
 bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const {
-  return MI.findRegisterUseOperandIdx(R600::AR_X) != -1;
+  return MI.findRegisterUseOperandIdx(R600::AR_X, false, &RI) != -1;
 }
 
 bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const {
-  return MI.findRegisterDefOperandIdx(R600::AR_X) != -1;
+  return MI.findRegisterDefOperandIdx(R600::AR_X, false, false, &RI) != -1;
 }
 
 bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
@@ -1500,19 +1500,19 @@ void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand,
 }
 
 unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind(
-    PseudoSourceValue::PSVKind Kind) const {
+    unsigned Kind) const {
   switch (Kind) {
   case PseudoSourceValue::Stack:
   case PseudoSourceValue::FixedStack:
-    return ST.getAMDGPUAS().PRIVATE_ADDRESS;
+    return AMDGPUAS::PRIVATE_ADDRESS;
   case PseudoSourceValue::ConstantPool:
   case PseudoSourceValue::GOT:
   case PseudoSourceValue::JumpTable:
   case PseudoSourceValue::GlobalValueCallEntry:
   case PseudoSourceValue::ExternalSymbolCallEntry:
   case PseudoSourceValue::TargetCustom:
-    return ST.getAMDGPUAS().CONSTANT_ADDRESS;
+    return AMDGPUAS::CONSTANT_ADDRESS;
   }
+
   llvm_unreachable("Invalid pseudo source kind");
-  return ST.getAMDGPUAS().PRIVATE_ADDRESS;
 }
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index 7a3dece316650..e6e34dc125f4d 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -324,7 +324,7 @@ public:
   }
 
   unsigned getAddressSpaceForPseudoSourceKind(
-      PseudoSourceValue::PSVKind Kind) const override;
+      unsigned Kind) const override;
 };
 
 namespace R600 {
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 7bf174f4cd864..10e8737552224 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -299,7 +299,7 @@ class VTX_READ <string name, dag outs, list<dag> pattern>
 class LoadParamFrag <PatFrag load_type> : PatFrag <
   (ops node:$ptr), (load_type node:$ptr),
   [{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
-            (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.PARAM_I_ADDRESS); }]
+            (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }]
 >;
 
 def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>;
@@ -309,8 +309,8 @@ def vtx_id3_load : LoadParamFrag<load>;
 class LoadVtxId1 <PatFrag load> : PatFrag <
   (ops node:$ptr), (load node:$ptr), [{
   const MemSDNode *LD = cast<MemSDNode>(N);
-  return LD->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
-         (LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+  return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+         (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
            !isa<GlobalValue>(GetUnderlyingObject(
            LD->getMemOperand()->getValue(), CurDAG->getDataLayout())));
 }]>;
@@ -322,7 +322,7 @@ def vtx_id1_load : LoadVtxId1 <load>;
 class LoadVtxId2 <PatFrag load> : PatFrag <
   (ops node:$ptr), (load node:$ptr), [{
   const MemSDNode *LD = cast<MemSDNode>(N);
-  return LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+  return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
          isa<GlobalValue>(GetUnderlyingObject(
          LD->getMemOperand()->getValue(), CurDAG->getDataLayout()));
 }]>;
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index a1429a2ac50f1..7769a35aadcee 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -127,13 +127,13 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
 
   LLVM_DEBUG(if (SU) {
     dbgs() << " ** Pick node **\n";
-    SU->dump(DAG);
+    DAG->dumpNode(*SU);
   } else {
     dbgs() << "NO NODE \n";
     for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
       const SUnit &S = DAG->SUnits[i];
       if (!S.isScheduled)
-        S.dump(DAG);
+        DAG->dumpNode(S);
     }
   });
 
@@ -188,11 +188,11 @@ isPhysicalRegCopy(MachineInstr *MI) {
 }
 
 void R600SchedStrategy::releaseTopNode(SUnit *SU) {
-  LLVM_DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
+  LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
 }
 
 void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
-  LLVM_DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
+  LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
   if (isPhysicalRegCopy(SU->getInstr())) {
     PhysicalRegCopy.push_back(SU);
     return;
@@ -236,6 +236,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
       // MI will become a KILL, don't considers it in scheduling
       return AluDiscarded;
     }
+    break;
   default:
     break;
   }
diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp
new file mode 100644
index 0000000000000..69cafef4a3513
--- /dev/null
+++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -0,0 +1,181 @@
+//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Any MIMG instructions that use tfe or lwe require an initialization of the
+/// result register that will be written in the case of a memory access failure
+/// The required code is also added to tie this init code to the result of the
+/// img instruction
+///
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-img-init"
+
+using namespace llvm;
+
+namespace {
+
+class SIAddIMGInit : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIAddIMGInit() : MachineFunctionPass(ID) {
+    initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false)
+
+char SIAddIMGInit::ID = 0;
+
+char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
+
+FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); }
+
+bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *RI = ST.getRegisterInfo();
+  bool Changed = false;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      auto Opcode = MI.getOpcode();
+      if (TII->isMIMG(Opcode) && !MI.mayStore()) {
+        MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+        MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+        MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+
+        // Check for instructions that don't have tfe or lwe fields
+        // There shouldn't be any at this point.
+        assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction");
+
+        unsigned TFEVal = TFE->getImm();
+        unsigned LWEVal = LWE->getImm();
+        unsigned D16Val = D16 ? D16->getImm() : 0;
+
+        if (TFEVal || LWEVal) {
+          // At least one of TFE or LWE are non-zero
+          // We have to insert a suitable initialization of the result value and
+          // tie this to the dest of the image instruction.
+
+          const DebugLoc &DL = MI.getDebugLoc();
+
+          int DstIdx =
+              AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+
+          // Calculate which dword we have to initialize to 0.
+          MachineOperand *MO_Dmask =
+              TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+
+          // check that dmask operand is found.
+          assert(MO_Dmask && "Expected dmask operand in instruction");
+
+          unsigned dmask = MO_Dmask->getImm();
+          // Determine the number of active lanes taking into account the
+          // Gather4 special case
+          unsigned ActiveLanes =
+              TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
+
+          // Subreg indices are counted from 1
+          // When D16 then we want next whole VGPR after write data.
+          static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected");
+
+          bool Packed = !ST.hasUnpackedD16VMem();
+
+          unsigned InitIdx =
+              D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
+
+          // Abandon attempt if the dst size isn't large enough
+          // - this is in fact an error but this is picked up elsewhere and
+          // reported correctly.
+          uint32_t DstSize =
+              RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+          if (DstSize < InitIdx)
+            continue;
+
+          // Create a register for the intialization value.
+          unsigned PrevDst =
+              MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+          unsigned NewDst = 0; // Final initialized value will be in here
+
+          // If PRTStrictNull feature is enabled (the default) then initialize
+          // all the result registers to 0, otherwise just the error indication
+          // register (VGPRn+1)
+          unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
+          unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx;
+
+          if (DstSize == 1) {
+            // In this case we can just initialize the result directly
+            BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst)
+                .addImm(0);
+            NewDst = PrevDst;
+          } else {
+            BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
+            for (; SizeLeft; SizeLeft--, CurrIdx++) {
+              NewDst =
+                  MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+              // Initialize dword
+              unsigned SubReg =
+                  MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+              BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
+                  .addImm(0);
+              // Insert into the super-reg
+              BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
+                  .addReg(PrevDst)
+                  .addReg(SubReg)
+                  .addImm(CurrIdx);
+
+              PrevDst = NewDst;
+            }
+          }
+
+          // Add as an implicit operand
+          MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit);
+
+          // Tie the just added implicit operand to the dst
+          MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
+
+          Changed = true;
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 74f1bd8fb9866..98e9ea662324f 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -16,7 +16,7 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/BasicBlock.h"
@@ -52,7 +52,7 @@ using StackEntry = std::pair<BasicBlock *, Value *>;
 using StackVector = SmallVector<StackEntry, 16>;
 
 class SIAnnotateControlFlow : public FunctionPass {
-  DivergenceAnalysis *DA;
+  LegacyDivergenceAnalysis *DA;
 
   Type *Boolean;
   Type *Void;
@@ -66,9 +66,7 @@ class SIAnnotateControlFlow : public FunctionPass {
 
   Function *If;
   Function *Else;
-  Function *Break;
   Function *IfBreak;
-  Function *ElseBreak;
   Function *Loop;
   Function *EndCf;
 
@@ -95,8 +93,7 @@ class SIAnnotateControlFlow : public FunctionPass {
 
   Value *
   handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L,
-                      BranchInst *Term,
-                      SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions);
+                      BranchInst *Term);
 
   void handleLoop(BranchInst *Term);
 
@@ -116,7 +113,7 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
@@ -127,7 +124,7 @@ public:
 INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
                       "Annotate SI Control Flow", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
                     "Annotate SI Control Flow", false, false)
 
@@ -149,9 +146,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
 
   If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
   Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
-  Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break);
   IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
-  ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break);
   Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
   EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
   return false;
@@ -160,7 +155,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
 /// Is the branch condition uniform or did the StructurizeCFG pass
 /// consider it as such?
 bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
-  return DA->isUniform(T->getCondition()) ||
+  return DA->isUniform(T) ||
          T->getMetadata("structurizecfg.uniform") != nullptr;
 }
 
@@ -227,76 +222,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
 
 /// Recursively handle the condition leading to a loop
 Value *SIAnnotateControlFlow::handleLoopCondition(
-    Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,
-    SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {
-  // Only search through PHI nodes which are inside the loop.  If we try this
-  // with PHI nodes that are outside of the loop, we end up inserting new PHI
-  // nodes outside of the loop which depend on values defined inside the loop.
-  // This will break the module with
-  // 'Instruction does not dominate all users!' errors.
-  PHINode *Phi = nullptr;
-  if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
-    BasicBlock *Parent = Phi->getParent();
-    PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front());
-    Value *Ret = NewPhi;
-
-    // Handle all non-constant incoming values first
-    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
-      Value *Incoming = Phi->getIncomingValue(i);
-      BasicBlock *From = Phi->getIncomingBlock(i);
-      if (isa<ConstantInt>(Incoming)) {
-        NewPhi->addIncoming(Broken, From);
-        continue;
-      }
-
-      Phi->setIncomingValue(i, BoolFalse);
-      Value *PhiArg = handleLoopCondition(Incoming, Broken, L,
-                                          Term, LoopPhiConditions);
-      NewPhi->addIncoming(PhiArg, From);
-    }
-
-    BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
-
-    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
-      Value *Incoming = Phi->getIncomingValue(i);
-      if (Incoming != BoolTrue)
-        continue;
-
-      BasicBlock *From = Phi->getIncomingBlock(i);
-      if (From == IDom) {
-        // We're in the following situation:
-        //   IDom/From
-        //      |   \
-        //      |   If-block
-        //      |   /
-        //     Parent
-        // where we want to break out of the loop if the If-block is not taken.
-        // Due to the depth-first traversal, there should be an end.cf
-        // intrinsic in Parent, and we insert an else.break before it.
-        //
-        // Note that the end.cf need not be the first non-phi instruction
-        // of parent, particularly when we're dealing with a multi-level
-        // break, but it should occur within a group of intrinsic calls
-        // at the beginning of the block.
-        CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
-        while (OldEnd && OldEnd->getCalledFunction() != EndCf)
-          OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode());
-        if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
-          Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
-          Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
-          continue;
-        }
-      }
-
-      TerminatorInst *Insert = From->getTerminator();
-      Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
-      NewPhi->setIncomingValue(i, PhiArg);
-    }
-
-    LoopPhiConditions.push_back(WeakTrackingVH(Phi));
-    return Ret;
-  }
-
+    Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term) {
   if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
     BasicBlock *Parent = Inst->getParent();
     Instruction *Insert;
@@ -335,21 +261,15 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
   BasicBlock *Target = Term->getSuccessor(1);
   PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
 
-  SmallVector<WeakTrackingVH, 8> LoopPhiConditions;
   Value *Cond = Term->getCondition();
   Term->setCondition(BoolTrue);
-  Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions);
+  Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
 
   for (BasicBlock *Pred : predecessors(Target))
     Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred);
 
   Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
 
-  for (WeakTrackingVH Val : llvm::reverse(LoopPhiConditions)) {
-    if (PHINode *Cond = cast_or_null<PHINode>(Val))
-      eraseIfUnused(Cond);
-  }
-
   push(Term->getSuccessor(0), Arg);
 }
 
@@ -372,7 +292,8 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
         Preds.push_back(Pred);
     }
 
-    BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
+    BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
+                                false);
   }
 
   Value *Exec = popSaved();
@@ -386,7 +307,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
 bool SIAnnotateControlFlow::runOnFunction(Function &F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  DA = &getAnalysis<DivergenceAnalysis>();
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();
 
   for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
        E = df_end(&F.getEntryBlock()); I != E; ++I) {
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a6d28d6999e5f..7f6abc34cff3a 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -88,7 +88,10 @@ enum : uint64_t {
   IsPacked = UINT64_C(1) << 49,
 
   // Is a D16 buffer instruction.
-  D16Buf = UINT64_C(1) << 50
+  D16Buf = UINT64_C(1) << 50,
+
+  // Uses floating point double precision rounding mode
+  FPDPRounding = UINT64_C(1) << 51
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 566e0d3febc78..809f5bab46932 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -183,13 +183,15 @@ getCopyRegClasses(const MachineInstr &Copy,
 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
+         TRI.hasVGPRs(SrcRC);
 }
 
 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
+  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
+         TRI.hasVGPRs(DstRC);
 }
 
 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
@@ -327,9 +329,7 @@ static bool phiHasBreakDef(const MachineInstr &PHI,
     switch (DefInstr->getOpcode()) {
     default:
       break;
-    case AMDGPU::SI_BREAK:
     case AMDGPU::SI_IF_BREAK:
-    case AMDGPU::SI_ELSE_BREAK:
       return true;
     case AMDGPU::PHI:
       if (phiHasBreakDef(*DefInstr, MRI, Visited))
@@ -599,7 +599,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
           unsigned SrcReg = MI.getOperand(1).getReg();
           if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
-            TII->moveToVALU(MI);
+            TII->moveToVALU(MI, MDT);
             break;
           }
 
@@ -614,7 +614,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
             MI.setDesc(TII->get(SMovOp));
             break;
           }
-          TII->moveToVALU(MI);
+          TII->moveToVALU(MI, MDT);
         } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
           tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
         }
@@ -677,7 +677,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         SmallSet<unsigned, 8> Visited;
         if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
           LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
-          TII->moveToVALU(MI);
+          TII->moveToVALU(MI, MDT);
         }
         break;
       }
@@ -690,7 +690,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
 
         LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
 
-        TII->moveToVALU(MI);
+        TII->moveToVALU(MI, MDT);
         break;
       case AMDGPU::INSERT_SUBREG: {
         const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
@@ -700,7 +700,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         if (TRI->isSGPRClass(DstRC) &&
             (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
           LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
-          TII->moveToVALU(MI);
+          TII->moveToVALU(MI, MDT);
         }
         break;
       }
diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
index 5d613d8874fab..7761418c53364 100644
--- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
@@ -10,7 +10,7 @@
 /// \file
 /// Computations in WWM can overwrite values in inactive channels for
 /// variables that the register allocator thinks are dead. This pass adds fake
-/// uses of those variables to WWM instructions to make sure that they aren't
+/// uses of those variables to their def(s) to make sure that they aren't
 /// overwritten.
 ///
 /// As an example, consider this snippet:
@@ -29,25 +29,44 @@
 /// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
 /// it would clobber even the inactive channels for which the if-condition is
 /// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
-/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
+/// of %vgpr0 to its def to make sure they aren't allocated to the
 /// same register.
 ///
 /// In general, we need to figure out what registers might have their inactive
 /// channels which are eventually used accidentally clobbered by a WWM
-/// instruction. We approximate this using two conditions:
+/// instruction. We do that by spotting three separate cases of registers:
 ///
-/// 1. A definition of the variable reaches the WWM instruction.
-/// 2. The variable would be live at the WWM instruction if all its defs were
-/// partial defs (i.e. considered as a use), ignoring normal uses.
+/// 1. A "then phi": the value resulting from phi elimination of a phi node at
+///    the end of an if..endif. If there is WWM code in the "then", then we
+///    make the def at the end of the "then" branch a partial def by adding an
+///    implicit use of the register.
 ///
-/// If a register matches both conditions, then we add an implicit use of it to
-/// the WWM instruction. Condition #2 is the heart of the matter: every
-/// definition is really a partial definition, since every VALU instruction is
-/// implicitly predicated.  We can usually ignore this, but WWM forces us not
-/// to. Condition #1 prevents false positives if the variable is undefined at
-/// the WWM instruction anyways. This is overly conservative in certain cases,
-/// especially in uniform control flow, but this is a workaround anyways until
-/// LLVM gains the notion of predicated uses and definitions of variables.
+/// 2. A "loop exit register": a value written inside a loop but used outside the
+///    loop, where there is WWM code inside the loop (the case in the example
+///    above). We add an implicit_def of the register in the loop pre-header,
+///    and make the original def a partial def by adding an implicit use of the
+///    register.
+///
+/// 3. A "loop exit phi": the value resulting from phi elimination of a phi node
+///    in a loop header. If there is WWM code inside the loop, then we make all
+///    defs inside the loop partial defs by adding an implicit use of the
+///    register on each one.
+///
+/// Note that we do not need to consider an if..else..endif phi. We only need to
+/// consider non-uniform control flow, and control flow structurization would
+/// have transformed a non-uniform if..else..endif into two if..endifs.
+///
+/// The analysis to detect these cases relies on a property of the MIR
+/// arising from this pass running straight after PHIElimination and before any
+/// coalescing: that any virtual register with more than one definition must be
+/// the new register added to lower a phi node by PHIElimination.
+///
+/// FIXME: We should detect whether a register in one of the above categories is
+/// already live at the WWM code before deciding to add the implicit uses to
+/// synthesize its liveness.
+///
+/// FIXME: I believe this whole scheme may be flawed due to the possibility of
+/// the register allocator doing live interval splitting.
 ///
 //===----------------------------------------------------------------------===//
 
@@ -59,7 +78,9 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SparseBitVector.h"
 #include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 
@@ -71,10 +92,18 @@ namespace {
 
 class SIFixWWMLiveness : public MachineFunctionPass {
 private:
+  MachineDominatorTree *DomTree;
+  MachineLoopInfo *LoopInfo;
   LiveIntervals *LIS = nullptr;
+  const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
 
+  std::vector<MachineInstr *> WWMs;
+  std::vector<MachineOperand *> ThenDefs;
+  std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs;
+  std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs;
+
 public:
   static char ID;
 
@@ -84,13 +113,11 @@ public:
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
-  bool runOnWWMInstruction(MachineInstr &MI);
-
-  void addDefs(const MachineInstr &MI, SparseBitVector<> &set);
-
   StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequiredID(MachineDominatorsID);
+    AU.addRequiredID(MachineLoopInfoID);
     // Should preserve the same set that TwoAddressInstructions does.
     AU.addPreserved<SlotIndexes>();
     AU.addPreserved<LiveIntervals>();
@@ -100,11 +127,21 @@ public:
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
+
+private:
+  void processDef(MachineOperand &DefOpnd);
+  bool processThenDef(MachineOperand *DefOpnd);
+  bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop);
+  bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop);
 };
 
 } // End anonymous namespace.
 
-INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE,
+                "SI fix WWM liveness", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE,
                 "SI fix WWM liveness", false, false)
 
 char SIFixWWMLiveness::ID = 0;
@@ -115,89 +152,267 @@ FunctionPass *llvm::createSIFixWWMLivenessPass() {
   return new SIFixWWMLiveness();
 }
 
-void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs)
-{
-  for (const MachineOperand &Op : MI.defs()) {
-    if (Op.isReg()) {
-      unsigned Reg = Op.getReg();
-      if (TRI->isVGPR(*MRI, Reg))
-        Regs.set(Reg);
-    }
-  }
-}
+bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n");
+  bool Modified = false;
+
+  // This doesn't actually need LiveIntervals, but we can preserve them.
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
 
-bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
-  MachineBasicBlock *MBB = WWM.getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
-  // Compute the registers that are live out of MI by figuring out which defs
-  // are reachable from MI.
-  SparseBitVector<> LiveOut;
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
 
-  for (auto II = MachineBasicBlock::iterator(WWM), IE =
-       MBB->end(); II != IE; ++II) {
-    addDefs(*II, LiveOut);
-  }
+  DomTree = &getAnalysis<MachineDominatorTree>();
+  LoopInfo = &getAnalysis<MachineLoopInfo>();
 
-  for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB),
-                                        E = df_end(MBB);
-       I != E; ++I) {
-    for (const MachineInstr &MI : **I) {
-      addDefs(MI, LiveOut);
+  // Scan the function to find the WWM sections and the candidate registers for
+  // having liveness modified.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() == AMDGPU::EXIT_WWM)
+        WWMs.push_back(&MI);
+      else {
+        for (MachineOperand &DefOpnd : MI.defs()) {
+          if (DefOpnd.isReg()) {
+            unsigned Reg = DefOpnd.getReg();
+            if (TRI->isVGPR(*MRI, Reg))
+              processDef(DefOpnd);
+          }
+        }
+      }
     }
   }
+  if (!WWMs.empty()) {
+    // Synthesize liveness over WWM sections as required.
+    for (auto ThenDef : ThenDefs)
+      Modified |= processThenDef(ThenDef);
+    for (auto LoopExitDef : LoopExitDefs)
+      Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second);
+    for (auto LoopPhiDef : LoopPhiDefs)
+      Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second);
+  }
 
-  // Compute the registers that reach MI.
-  SparseBitVector<> Reachable;
+  WWMs.clear();
+  ThenDefs.clear();
+  LoopExitDefs.clear();
+  LoopPhiDefs.clear();
 
-  for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE =
-       MBB->rend(); II != IE; ++II) {
-    addDefs(*II, Reachable);
-  }
+  return Modified;
+}
 
-  for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
-                                         E = idf_end(MBB);
-       I != E; ++I) {
-    for (const MachineInstr &MI : **I) {
-      addDefs(MI, Reachable);
+// During the function scan, process an operand that defines a VGPR.
+// This categorizes the register and puts it in the appropriate list for later
+// use when processing a WWM section.
+void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) {
+  unsigned Reg = DefOpnd.getReg();
+  // Get all the defining instructions. For convenience, make Defs[0] the def
+  // we are on now.
+  SmallVector<const MachineInstr *, 4> Defs;
+  Defs.push_back(DefOpnd.getParent());
+  for (auto &MI : MRI->def_instructions(Reg)) {
+    if (&MI != DefOpnd.getParent())
+      Defs.push_back(&MI);
+  }
+  // Check whether this def dominates all the others. If not, ignore this def.
+  // Either it is going to be processed when the scan encounters its other def
+  // that dominates all defs, or there is no def that dominates all others.
+  // The latter case is an eliminated phi from an if..else..endif or similar,
+  // which must be for uniform control flow so can be ignored.
+  // Because this pass runs shortly after PHIElimination, we assume that any
+  // multi-def register is a lowered phi, and thus has each def in a separate
+  // basic block.
+  for (unsigned I = 1; I != Defs.size(); ++I) {
+    if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent()))
+      return;
+  }
+  // Check for the case of an if..endif lowered phi: It has two defs, one
+  // dominates the other, and there is a single use in a successor of the
+  // dominant def.
+  // Later we will spot any WWM code inside
+  // the "then" clause and turn the second def into a partial def so its
+  // liveness goes through the WWM code in the "then" clause.
+  if (Defs.size() == 2) {
+    auto DomDefBlock = Defs[0]->getParent();
+    if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) {
+      auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
+      for (auto Succ : DomDefBlock->successors()) {
+        if (Succ == UseBlock) {
+          LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n");
+          ThenDefs.push_back(&DefOpnd);
+          return;
+        }
+      }
     }
   }
-
-  // find the intersection, and add implicit uses.
-  LiveOut &= Reachable;
-
-  bool Modified = false;
-  for (unsigned Reg : LiveOut) {
-    WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
-    if (LIS) {
-      // FIXME: is there a better way to update the live interval?
-      LIS->removeInterval(Reg);
-      LIS->createAndComputeVirtRegInterval(Reg);
+  // Check for the case of a non-lowered-phi register (single def) that exits
+  // a loop, that is, it has a use that is outside a loop that the def is
+  // inside. We find the outermost loop that the def is inside but a use is
+  // outside. Later we will spot any WWM code inside that loop and then make
+  // the def a partial def so its liveness goes round the loop and through the
+  // WWM code.
+  if (Defs.size() == 1) {
+    auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent());
+    if (!Loop)
+      return;
+    bool IsLoopExit = false;
+    for (auto &Use : MRI->use_instructions(Reg)) {
+      auto UseBlock = Use.getParent();
+      if (Loop->contains(UseBlock))
+        continue;
+      IsLoopExit = true;
+      while (auto Parent = Loop->getParentLoop()) {
+        if (Parent->contains(UseBlock))
+          break;
+        Loop = Parent;
+      }
     }
-    Modified = true;
+    if (!IsLoopExit)
+      return;
+    LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+        << " is a loop exit reg with loop header at "
+        << "bb." << Loop->getHeader()->getNumber() << "\n");
+    LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>(
+            &DefOpnd, Loop));
+    return;
   }
-
-  return Modified;
+  // Check for the case of a lowered single-preheader-loop phi, that is, a
+  // multi-def register where the dominating def is in the loop pre-header and
+  // all other defs are in backedges. Later we will spot any WWM code inside
+  // that loop and then make the backedge defs partial defs so the liveness
+  // goes through the WWM code.
+  // Note that we are ignoring multi-preheader loops on the basis that the
+  // structurizer does not allow that for non-uniform loops.
+  // There must be a single use in the loop header.
+  if (!MRI->hasOneUse(Reg))
+    return;
+  auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
+  auto Loop = LoopInfo->getLoopFor(UseBlock);
+  if (!Loop || Loop->getHeader() != UseBlock
+      || Loop->contains(Defs[0]->getParent())) {
+    LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+        << " is multi-def but single use not in loop header\n");
+    return;
+  }
+  for (unsigned I = 1; I != Defs.size(); ++I) {
+    if (!Loop->contains(Defs[I]->getParent()))
+      return;
+  }
+  LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+      << " is a loop phi reg with loop header at "
+      << "bb." << Loop->getHeader()->getNumber() << "\n");
+  LoopPhiDefs.push_back(
+      std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
 }
 
-bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
-  bool Modified = false;
-
-  // This doesn't actually need LiveIntervals, but we can preserve them.
-  LIS = getAnalysisIfAvailable<LiveIntervals>();
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-
-  TRI = &TII->getRegisterInfo();
-  MRI = &MF.getRegInfo();
+// Process a then phi def: It has two defs, one dominates the other, and there
+// is a single use in a successor of the dominant def. Here we spot any WWM
+// code inside the "then" clause and turn the second def into a partial def so
+// its liveness goes through the WWM code in the "then" clause.
+bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) {
+  LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent());
+  if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
+    // Ignore if dominating def is undef.
+    LLVM_DEBUG(dbgs() << "  ignoring as dominating def is undef\n");
+    return false;
+  }
+  unsigned Reg = DefOpnd->getReg();
+  // Get the use block, which is the endif block.
+  auto UseBlock = MRI->use_instr_begin(Reg)->getParent();
+  // Check whether there is WWM code inside the then branch. The WWM code must
+  // be dominated by the if but not dominated by the endif.
+  bool ContainsWWM = false;
+  for (auto WWM : WWMs) {
+    if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent())
+        && !DomTree->dominates(UseBlock, WWM->getParent())) {
+      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
+      ContainsWWM = true;
+      break;
+    }
+  }
+  if (!ContainsWWM)
+    return false;
+  // Get the other def.
+  MachineInstr *OtherDef = nullptr;
+  for (auto &MI : MRI->def_instructions(Reg)) {
+    if (&MI != DefOpnd->getParent())
+      OtherDef = &MI;
+  }
+  // Make it a partial def.
+  OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+  LLVM_DEBUG(dbgs() << *OtherDef);
+  return true;
+}
 
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : MBB) {
-      if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
-        Modified |= runOnWWMInstruction(MI);
-      }
+// Process a loop exit def, that is, a register with a single use in a loop
+// that has a use outside the loop.  Here we spot any WWM code inside that loop
+// and then make the def a partial def so its liveness goes round the loop and
+// through the WWM code.
+bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd,
+      MachineLoop *Loop) {
+  LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent());
+  // Check whether there is WWM code inside the loop.
+  bool ContainsWWM = false;
+  for (auto WWM : WWMs) {
+    if (Loop->contains(WWM->getParent())) {
+      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
+      ContainsWWM = true;
+      break;
     }
   }
+  if (!ContainsWWM)
+    return false;
+  unsigned Reg = DefOpnd->getReg();
+  // Add a new implicit_def in loop preheader(s).
+  for (auto Pred : Loop->getHeader()->predecessors()) {
+    if (!Loop->contains(Pred)) {
+      auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(),
+          TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
+      LLVM_DEBUG(dbgs() << *ImplicitDef);
+      (void)ImplicitDef;
+    }
+  }
+  // Make the original def partial.
+  DefOpnd->getParent()->addOperand(MachineOperand::CreateReg(
+          Reg, false, /*isImp=*/true));
+  LLVM_DEBUG(dbgs() << *DefOpnd->getParent());
+  return true;
+}
 
-  return Modified;
+// Process a loop phi def, that is, a multi-def register where the dominating
+// def is in the loop pre-header and all other defs are in backedges. Here we
+// spot any WWM code inside that loop and then make the backedge defs partial
+// defs so the liveness goes through the WWM code.
+bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd,
+      MachineLoop *Loop) {
+  LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent());
+  // Check whether there is WWM code inside the loop.
+  bool ContainsWWM = false;
+  for (auto WWM : WWMs) {
+    if (Loop->contains(WWM->getParent())) {
+      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
+      ContainsWWM = true;
+      break;
+    }
+  }
+  if (!ContainsWWM)
+    return false;
+  unsigned Reg = DefOpnd->getReg();
+  // Remove kill mark from uses.
+  for (auto &Use : MRI->use_operands(Reg))
+    Use.setIsKill(false);
+  // Make all defs except the dominating one partial defs.
+  SmallVector<MachineInstr *, 4> Defs;
+  for (auto &Def : MRI->def_instructions(Reg))
+    Defs.push_back(&Def);
+  for (auto Def : Defs) {
+    if (DefOpnd->getParent() == Def)
+      continue;
+    Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+    LLVM_DEBUG(dbgs() << *Def);
+  }
+  return true;
 }
+
diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
new file mode 100644
index 0000000000000..ee39eb04d8316
--- /dev/null
+++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -0,0 +1,231 @@
+//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// SIFixupVectorISel pass cleans up post ISEL Vector issues.
+/// Currently this will convert GLOBAL_{LOAD|STORE}_*
+/// and GLOBAL_Atomic_* instructions into their _SADDR variants,
+/// feeding the sreg into the saddr field of the new instruction.
+/// We currently handle a REG_SEQUENCE feeding the vaddr
+/// and decompose it into a base and index.
+///
+/// Transform:
+/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32
+/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32,
+///                                    %24:vgpr_32, %19:sreg_64_xexec
+/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1
+/// %11:vreg_64 = COPY %16:vreg_64
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0
+/// Into:
+/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0
+/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16...
+///
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#define DEBUG_TYPE "si-fixup-vector-isel"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableGlobalSGPRAddr(
+  "amdgpu-enable-global-sgpr-addr",
+  cl::desc("Enable use of SGPR regs for GLOBAL LOAD/STORE instructions"),
+  cl::init(false));
+
+STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
+STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
+
+namespace {
+
+class SIFixupVectorISel : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFixupVectorISel() : MachineFunctionPass(ID) {
+    initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
+                "SI Fixup Vector ISel", false, false)
+
+char SIFixupVectorISel::ID = 0;
+
+char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
+
+FunctionPass *llvm::createSIFixupVectorISelPass() {
+  return new SIFixupVectorISel();
+}
+
+static bool findSRegBaseAndIndex(MachineOperand *Op,
+                                 unsigned &BaseReg,
+                                 unsigned &IndexReg,
+                                 MachineRegisterInfo &MRI,
+                                 const SIRegisterInfo *TRI) {
+  SmallVector<MachineOperand *, 8> Worklist;
+  Worklist.push_back(Op);
+  while (!Worklist.empty()) {
+    MachineOperand *WOp = Worklist.pop_back_val();
+    if (!WOp->isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(WOp->getReg()))
+      continue;
+    MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
+    switch (DefInst->getOpcode()) {
+    default:
+      continue;
+    case AMDGPU::COPY:
+      Worklist.push_back(&DefInst->getOperand(1));
+      break;
+    case AMDGPU::REG_SEQUENCE:
+      if (DefInst->getNumOperands() != 5)
+        continue;
+      Worklist.push_back(&DefInst->getOperand(1));
+      Worklist.push_back(&DefInst->getOperand(3));
+      break;
+    case AMDGPU::V_ADD_I32_e64:
+      // The V_ADD_* and its analogous V_ADDCV_* are generated by
+      // a previous pass which lowered from an ADD_64_PSEUDO,
+      // which generates subregs to break up the 64 bit args.
+      if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      BaseReg = DefInst->getOperand(2).getReg();
+      if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      IndexReg = DefInst->getOperand(3).getReg();
+      // Chase the IndexReg.
+      MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg);
+      if (!MI || !MI->isCopy())
+        continue;
+      // Make sure the reg class is 64 bit for Index.
+      // If the Index register is a subreg, we want it to reference
+      // a 64 bit register which we will use as the Index reg.
+      const TargetRegisterClass *IdxRC, *BaseRC;
+      IdxRC = MRI.getRegClass(MI->getOperand(1).getReg());
+      if (AMDGPU::getRegBitWidth(IdxRC->getID()) != 64)
+        continue;
+      IndexReg = MI->getOperand(1).getReg();
+      // Chase the BaseReg.
+      MI = MRI.getUniqueVRegDef(BaseReg);
+      if (!MI || !MI->isCopy())
+        continue;
+      // Make sure the register class is 64 bit for Base.
+      BaseReg = MI->getOperand(1).getReg();
+      BaseRC = MRI.getRegClass(BaseReg);
+      if (AMDGPU::getRegBitWidth(BaseRC->getID()) != 64)
+        continue;
+      // Make sure Base is SReg and Index is VReg.
+      if (!TRI->isSGPRReg(MRI, BaseReg))
+        return false;
+      if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg)))
+        return false;
+      // clear any killed flags on Index and Base regs, used later.
+      MRI.clearKillFlags(IndexReg);
+      MRI.clearKillFlags(BaseReg);
+      return true;
+    }
+  }
+  return false;
+}
+
+// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR.
+static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
+                             MachineFunction &MF,
+                             MachineRegisterInfo &MRI,
+                             const GCNSubtarget &ST,
+                             const SIInstrInfo *TII,
+                             const SIRegisterInfo *TRI) {
+  if (!EnableGlobalSGPRAddr)
+    return false;
+  bool FuncModified = false;
+  MachineBasicBlock::iterator I, Next;
+  for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    Next = std::next(I);
+    MachineInstr &MI = *I;
+    int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
+    if (NewOpcd < 0)
+      continue;
+    // Update our statistics on opportunities seen.
+    ++NumSGPRGlobalOccurs;
+    LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n');
+    // Need a Base and Index or we cant transform to _SADDR.
+    unsigned BaseReg = 0;
+    unsigned IndexReg = 0;
+    MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+    if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI))
+      continue;
+    ++NumSGPRGlobalSaddrs;
+    FuncModified = true;
+    // Create the new _SADDR Memory instruction.
+    bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
+    MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+    MachineInstr *NewGlob = nullptr;
+    NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
+    if (HasVdst)
+      NewGlob->addOperand(MF, MI.getOperand(0));
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
+    if (VData)
+      NewGlob->addOperand(MF, *VData);
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
+
+    MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
+    // Atomics dont have a GLC, so omit the field if not there.
+    if (Glc)
+      NewGlob->addOperand(MF, *Glc);
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
+    // _D16 have an vdst_in operand, copy it in.
+    MachineOperand *VDstInOp = TII->getNamedOperand(MI,
+                                      AMDGPU::OpName::vdst_in);
+    if (VDstInOp)
+      NewGlob->addOperand(MF, *VDstInOp);
+    NewGlob->copyImplicitOps(MF, MI);
+    NewGlob->cloneMemRefs(MF, MI);
+    // Remove the old Global Memop instruction.
+    MI.eraseFromParent();
+    LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
+  }
+  return FuncModified;
+}
+
+bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  bool FuncModified = false;
+  for (MachineBasicBlock &MBB : MF) {
+    // Cleanup missed Saddr opportunites from ISel.
+    FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI);
+  }
+  return FuncModified;
+}
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 338cabcb906bc..f4e8669583699 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -35,13 +35,16 @@ struct FoldCandidate {
     uint64_t ImmToFold;
     int FrameIndexToFold;
   };
+  int ShrinkOpcode;
   unsigned char UseOpNo;
   MachineOperand::MachineOperandType Kind;
   bool Commuted;
 
   FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
-                bool Commuted_ = false) :
-    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
+                bool Commuted_ = false,
+                int ShrinkOp = -1) :
+    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+    Kind(FoldOp->getType()),
     Commuted(Commuted_) {
     if (FoldOp->isImm()) {
       ImmToFold = FoldOp->getImm();
@@ -68,6 +71,14 @@ struct FoldCandidate {
   bool isCommuted() const {
     return Commuted;
   }
+
+  bool needsShrink() const {
+    return ShrinkOpcode != -1;
+  }
+
+  int getShrinkOpcode() const {
+    return ShrinkOpcode;
+  }
 };
 
 class SIFoldOperands : public MachineFunctionPass {
@@ -154,6 +165,7 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
 }
 
 static bool updateOperand(FoldCandidate &Fold,
+                          const SIInstrInfo &TII,
                           const TargetRegisterInfo &TRI) {
   MachineInstr *MI = Fold.UseMI;
   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
@@ -189,10 +201,49 @@ static bool updateOperand(FoldCandidate &Fold,
         Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
       }
     }
+
+    if (Fold.needsShrink()) {
+      MachineBasicBlock *MBB = MI->getParent();
+      auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
+      if (Liveness != MachineBasicBlock::LQR_Dead)
+        return false;
+
+      MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+      int Op32 = Fold.getShrinkOpcode();
+      MachineOperand &Dst0 = MI->getOperand(0);
+      MachineOperand &Dst1 = MI->getOperand(1);
+      assert(Dst0.isDef() && Dst1.isDef());
+
+      bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
+
+      const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
+      unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
+      const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
+      unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
+
+      MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
+
+      if (HaveNonDbgCarryUse) {
+        BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
+          .addReg(AMDGPU::VCC, RegState::Kill);
+      }
+
+      // Keep the old instruction around to avoid breaking iterators, but
+      // replace the outputs with dummy registers.
+      Dst0.setReg(NewReg0);
+      Dst1.setReg(NewReg1);
+
+      if (Fold.isCommuted())
+        TII.commuteInstruction(*Inst32, false);
+      return true;
+    }
+
     Old.ChangeToImmediate(Fold.ImmToFold);
     return true;
   }
 
+  assert(!Fold.needsShrink() && "not handled");
+
   if (Fold.isFI()) {
     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     return true;
@@ -261,6 +312,8 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     if (isUseMIInFoldList(FoldList, MI))
       return false;
 
+    unsigned CommuteOpNo = OpNo;
+
     // Operand is not legal, so try to commute the instruction to
     // see if this makes it possible to fold.
     unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
@@ -269,11 +322,12 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
 
     if (CanCommute) {
       if (CommuteIdx0 == OpNo)
-        OpNo = CommuteIdx1;
+        CommuteOpNo = CommuteIdx1;
       else if (CommuteIdx1 == OpNo)
-        OpNo = CommuteIdx0;
+        CommuteOpNo = CommuteIdx0;
     }
 
+
     // One of operands might be an Imm operand, and OpNo may refer to it after
     // the call of commuteInstruction() below. Such situations are avoided
     // here explicitly as OpNo must be a register operand to be a candidate
@@ -286,12 +340,34 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
         !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
       return false;
 
-    if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
+      if ((Opc == AMDGPU::V_ADD_I32_e64 ||
+           Opc == AMDGPU::V_SUB_I32_e64 ||
+           Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
+          OpToFold->isImm()) {
+        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+        // Verify the other operand is a VGPR, otherwise we would violate the
+        // constant bus restriction.
+        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
+        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
+        if (!OtherOp.isReg() ||
+            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
+          return false;
+
+        assert(MI->getOperand(1).isDef());
+
+        int Op32 =  AMDGPU::getVOPe32(Opc);
+        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
+                                         Op32));
+        return true;
+      }
+
       TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
       return false;
     }
 
-    FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
+    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
     return true;
   }
 
@@ -362,8 +438,6 @@ void SIFoldOperands::foldOperand(
 
   bool FoldingImm = OpToFold.isImm();
 
-  // In order to fold immediates into copies, we need to change the
-  // copy to a MOV.
   if (FoldingImm && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
     const TargetRegisterClass *DestRC
@@ -371,6 +445,31 @@ void SIFoldOperands::foldOperand(
       MRI->getRegClass(DestReg) :
       TRI->getPhysRegClass(DestReg);
 
+    unsigned SrcReg  = UseMI->getOperand(1).getReg();
+    if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
+      TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+      const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
+      if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
+        MachineRegisterInfo::use_iterator NextUse;
+        SmallVector<FoldCandidate, 4> CopyUses;
+        for (MachineRegisterInfo::use_iterator
+          Use = MRI->use_begin(DestReg), E = MRI->use_end();
+          Use != E; Use = NextUse) {
+          NextUse = std::next(Use);
+          FoldCandidate FC = FoldCandidate(Use->getParent(),
+           Use.getOperandNo(), &UseMI->getOperand(1));
+          CopyUses.push_back(FC);
+       }
+        for (auto & F : CopyUses) {
+          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
+           FoldList, CopiesToReplace);
+        }
+      }
+    }
+
+    // In order to fold immediates into copies, we need to change the
+    // copy to a MOV.
+
     unsigned MovOp = TII->getMovOpcode(DestRC);
     if (MovOp == AMDGPU::COPY)
       return;
@@ -378,6 +477,20 @@ void SIFoldOperands::foldOperand(
     UseMI->setDesc(TII->get(MovOp));
     CopiesToReplace.push_back(UseMI);
   } else {
+    if (UseMI->isCopy() && OpToFold.isReg() &&
+        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
+        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
+        TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+        TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
+        !UseMI->getOperand(1).getSubReg()) {
+      UseMI->getOperand(1).setReg(OpToFold.getReg());
+      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+      UseMI->getOperand(1).setIsKill(false);
+      CopiesToReplace.push_back(UseMI);
+      OpToFold.setIsKill(false);
+      return;
+    }
+
     const MCInstrDesc &UseDesc = UseMI->getDesc();
 
     // Don't fold into target independent nodes.  Target independent opcodes
@@ -550,6 +663,19 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
   if (!Src0->isImm() && !Src1->isImm())
     return false;
 
+  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
+    if (Src0->isImm() && Src0->getImm() == 0) {
+      // v_lshl_or_b32 0, X, Y -> copy Y
+      // v_lshl_or_b32 0, X, K -> v_mov_b32 K
+      bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
+      MI->RemoveOperand(Src1Idx);
+      MI->RemoveOperand(Src0Idx);
+
+      MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
+      return true;
+    }
+  }
+
   // and k0, k1 -> v_mov_b32 (k0 & k1)
   // or k0, k1 -> v_mov_b32 (k0 | k1)
   // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
@@ -728,13 +854,17 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     }
   } else {
     // Folding register.
+    SmallVector <MachineRegisterInfo::use_iterator, 4> UsesToProcess;
     for (MachineRegisterInfo::use_iterator
            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
          Use != E; ++Use) {
-      MachineInstr *UseMI = Use->getParent();
+      UsesToProcess.push_back(Use);
+    }
+    for (auto U : UsesToProcess) {
+      MachineInstr *UseMI = U->getParent();
 
-      foldOperand(OpToFold, UseMI, Use.getOperandNo(),
-                  FoldList, CopiesToReplace);
+      foldOperand(OpToFold, UseMI, U.getOperandNo(),
+        FoldList, CopiesToReplace);
     }
   }
 
@@ -744,7 +874,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     Copy->addImplicitDefUseOperands(*MF);
 
   for (FoldCandidate &Fold : FoldList) {
-    if (updateOperand(Fold, *TRI)) {
+    if (updateOperand(Fold, *TII, *TRI)) {
       // Clear kill flags.
       if (Fold.isReg()) {
         assert(Fold.OpToFold && Fold.OpToFold->isReg());
@@ -981,9 +1111,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
   // correctly handle signed zeros.
   //
-  // TODO: Check nsz on instructions when fast math flags are preserved to MI
-  // level.
-  bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
+  bool IsIEEEMode = ST->enableIEEEBit(MF);
+  bool HasNSZ = MFI->hasNoSignedZerosFPMath();
 
   for (MachineBasicBlock *MBB : depth_first(&MF)) {
     MachineBasicBlock::iterator I, Next;
@@ -994,7 +1123,10 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       tryFoldInst(TII, &MI);
 
       if (!TII->isFoldableCopy(MI)) {
-        if (IsIEEEMode || !tryFoldOMod(MI))
+        // TODO: Omod might be OK if there is NSZ only on the source
+        // instruction, and not the omod multiply.
+        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
+            !tryFoldOMod(MI))
           tryFoldClamp(MI);
         continue;
       }
diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index cd14239de822b..aa976d5141f86 100644
--- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -168,16 +168,15 @@ void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask,
     CoveringSubregs.push_back(Idx);
   }
 
-  llvm::sort(CoveringSubregs.begin(), CoveringSubregs.end(),
-             [this](unsigned A, unsigned B) {
-               LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
-               LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
-               unsigned NA = MaskA.getNumLanes();
-               unsigned NB = MaskB.getNumLanes();
-               if (NA != NB)
-                 return NA > NB;
-               return MaskA.getHighestLane() > MaskB.getHighestLane();
-             });
+  llvm::sort(CoveringSubregs, [this](unsigned A, unsigned B) {
+    LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
+    LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
+    unsigned NA = MaskA.getNumLanes();
+    unsigned NB = MaskB.getNumLanes();
+    if (NA != NB)
+      return NA > NB;
+    return MaskA.getHighestLane() > MaskB.getHighestLane();
+  });
 
   for (unsigned Idx : CoveringSubregs) {
     LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index ac0ef90f25a4f..e4633c88e18ff 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -289,7 +289,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
-  if (ST.isAmdCodeObjectV2(F)) {
+  if (ST.isAmdHsaOrMesa(F)) {
     PreloadedPrivateBufferReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
   }
@@ -308,7 +308,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   }
 
   if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
-    assert(ST.isAmdCodeObjectV2(F) || ST.isMesaGfxShader(F));
+    assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
     MRI.addLiveIn(PreloadedPrivateBufferReg);
     MBB.addLiveIn(PreloadedPrivateBufferReg);
   }
@@ -333,7 +333,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
 
   bool CopyBuffer = ResourceRegUsed &&
     PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
-    ST.isAmdCodeObjectV2(F) &&
+    ST.isAmdHsaOrMesa(F) &&
     ScratchRsrcReg != PreloadedPrivateBufferReg;
 
   // This needs to be careful of the copying order to avoid overwriting one of
@@ -433,7 +433,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
   }
   if (ST.isMesaGfxShader(Fn)
       || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
-    assert(!ST.isAmdCodeObjectV2(Fn));
+    assert(!ST.isAmdHsaOrMesa(Fn));
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
     unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 25007861fd158..0ba921647097d 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(__MINGW32__)
 // Provide M_PI.
 #define _USE_MATH_DEFINES
 #endif
@@ -156,12 +156,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
+  setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
 
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
   setOperationAction(ISD::STORE, MVT::i1, Custom);
+  setOperationAction(ISD::STORE, MVT::v32i32, Custom);
 
   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
@@ -207,11 +209,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
 
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@@ -232,6 +237,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
   setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
 
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+
 #if 0
   setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
   setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
@@ -240,7 +249,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
   for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
-        MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) {
+        MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -339,6 +348,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   if (Subtarget->has16BitInsts()) {
     setOperationAction(ISD::FLOG, MVT::f16, Custom);
+    setOperationAction(ISD::FEXP, MVT::f16, Custom);
     setOperationAction(ISD::FLOG10, MVT::f16, Custom);
   }
 
@@ -375,8 +385,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->hasBFE())
     setHasExtractBitsInsn(true);
 
-  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
+  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
+  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
+  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
+
+
+  // These are really only legal for ieee_mode functions. We should be avoiding
+  // them for functions that don't have ieee_mode enabled, so just say they are
+  // legal.
+  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
+  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
+  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
+  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+
 
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
@@ -465,8 +487,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     // F16 - VOP2 Actions.
     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
-    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
-    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+
     setOperationAction(ISD::FDIV, MVT::f16, Custom);
 
     // F16 - VOP3 Actions.
@@ -549,6 +570,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     // This isn't really legal, but this avoids the legalizer unrolling it (and
     // allows matching fneg (fabs x) patterns)
     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
+
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
+
+    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
   }
 
   if (Subtarget->hasVOP3PInsts()) {
@@ -566,8 +598,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
-    setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
-    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
+
     setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
 
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
@@ -587,9 +621,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
     setOperationAction(ISD::FADD, MVT::v4f16, Custom);
     setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+
+    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
+    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
+
     setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
     setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
+    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
 
+    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
     setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
     setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
   }
@@ -623,6 +663,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
   setTargetDAGCombine(ISD::FMAXNUM);
+  setTargetDAGCombine(ISD::FMINNUM_IEEE);
+  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
   setTargetDAGCombine(ISD::FMA);
   setTargetDAGCombine(ISD::SMIN);
   setTargetDAGCombine(ISD::SMAX);
@@ -638,7 +680,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
-  setTargetDAGCombine(ISD::BUILD_VECTOR);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -707,9 +749,7 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
     if (Size == 64)
       return MVT::i32;
 
-    if (Size == 16 &&
-        Subtarget->has16BitInsts() &&
-        isPowerOf2_32(VT.getVectorNumElements()))
+    if (Size == 16 && Subtarget->has16BitInsts())
       return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
   }
 
@@ -730,9 +770,8 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
     if (Size == 64)
       return 2 * NumElts;
 
-    // FIXME: Fails to break down as we want with v3.
-    if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
-      return VT.getVectorNumElements() / 2;
+    if (Size == 16 && Subtarget->has16BitInsts())
+      return (VT.getVectorNumElements() + 1) / 2;
   }
 
   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
@@ -763,10 +802,10 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
     // FIXME: We should fix the ABI to be the same on targets without 16-bit
     // support, but unless we can properly handle 3-vectors, it will be still be
     // inconsistent.
-    if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) {
+    if (Size == 16 && Subtarget->has16BitInsts()) {
       RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
       IntermediateVT = RegisterVT;
-      NumIntermediates = NumElts / 2;
+      NumIntermediates = (NumElts + 1) / 2;
       return NumIntermediates;
     }
   }
@@ -775,6 +814,47 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
     Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
 }
 
+static MVT memVTFromAggregate(Type *Ty) {
+  // Only limited forms of aggregate type currently expected.
+  assert(Ty->isStructTy() && "Expected struct type");
+
+
+  Type *ElementType = nullptr;
+  unsigned NumElts;
+  if (Ty->getContainedType(0)->isVectorTy()) {
+    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
+    ElementType = VecComponent->getElementType();
+    NumElts = VecComponent->getNumElements();
+  } else {
+    ElementType = Ty->getContainedType(0);
+    NumElts = 1;
+  }
+
+  assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
+
+  // Calculate the size of the memVT type from the aggregate
+  unsigned Pow2Elts = 0;
+  unsigned ElementSize;
+  switch (ElementType->getTypeID()) {
+    default:
+      llvm_unreachable("Unknown type!");
+    case Type::IntegerTyID:
+      ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
+      break;
+    case Type::HalfTyID:
+      ElementSize = 16;
+      break;
+    case Type::FloatTyID:
+      ElementSize = 32;
+      break;
+  }
+  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
+  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
+
+  return MVT::getVectorVT(MVT::getVT(ElementType, false),
+                          Pow2Elts);
+}
+
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           MachineFunction &MF,
@@ -802,7 +882,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.flags = MachineMemOperand::MODereferenceable;
     if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
       Info.opc = ISD::INTRINSIC_W_CHAIN;
-      Info.memVT = MVT::getVT(CI.getType());
+      Info.memVT = MVT::getVT(CI.getType(), true);
+      if (Info.memVT == MVT::Other) {
+        // Some intrinsics return an aggregate type - special case to work out
+        // the correct memVT
+        Info.memVT = memVTFromAggregate(CI.getType());
+      }
       Info.flags |= MachineMemOperand::MOLoad;
     } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
       Info.opc = ISD::INTRINSIC_VOID;
@@ -941,11 +1026,11 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
   if (AM.BaseGV)
     return false;
 
-  if (AS == AMDGPUASI.GLOBAL_ADDRESS)
+  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
     return isLegalGlobalAddressingMode(AM);
 
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
-      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
+  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
     // FIXME: Can we get the real alignment here?
@@ -983,10 +1068,10 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
 
     return false;
 
-  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
     return isLegalMUBUFAddressingMode(AM);
-  } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
-             AS == AMDGPUASI.REGION_ADDRESS) {
+  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
+             AS == AMDGPUAS::REGION_ADDRESS) {
     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
     // field.
     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
@@ -1001,8 +1086,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
       return true;
 
     return false;
-  } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
-             AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
+  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
+             AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
     // For an unknown address space, this usually means that this is for some
     // reason being used for pure arithmetic, and not based on some addressing
     // computation. We don't have instructions that compute pointers with any
@@ -1016,12 +1101,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
 
 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                         const SelectionDAG &DAG) const {
-  if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
+  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
     return (MemVT.getSizeInBits() <= 4 * 32);
-  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
     return (MemVT.getSizeInBits() <= MaxPrivateBits);
-  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
     return (MemVT.getSizeInBits() <= 2 * 32);
   }
   return true;
@@ -1043,8 +1128,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     return false;
   }
 
-  if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
-      AddrSpace == AMDGPUASI.REGION_ADDRESS) {
+  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
     // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
     // aligned, 8 byte access in a single operation using ds_read2/write2_b32
     // with adjacent offsets.
@@ -1059,17 +1144,21 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   // will access scratch.  If we had access to the IR function, then we
   // could determine if any private memory was used in the function.
   if (!Subtarget->hasUnalignedScratchAccess() &&
-      (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
-       AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
-    return false;
+      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+    bool AlignedBy4 = Align >= 4;
+    if (IsFast)
+      *IsFast = AlignedBy4;
+
+    return AlignedBy4;
   }
 
   if (Subtarget->hasUnalignedBufferAccess()) {
     // If we have an uniform constant load, it still requires using a slow
     // buffer instruction if unaligned.
     if (IsFast) {
-      *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
-                 AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
+      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
+                 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
         (Align % 4 == 0) : true;
     }
 
@@ -1109,17 +1198,15 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
   return MVT::Other;
 }
 
-static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
-  return AS == AMDGPUASI.GLOBAL_ADDRESS ||
-         AS == AMDGPUASI.FLAT_ADDRESS ||
-         AS == AMDGPUASI.CONSTANT_ADDRESS ||
-         AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
+static bool isFlatGlobalAddrSpace(unsigned AS) {
+  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+         AS == AMDGPUAS::FLAT_ADDRESS ||
+         AS == AMDGPUAS::CONSTANT_ADDRESS;
 }
 
 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
-  return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
-         isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
+  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
 }
 
 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
@@ -1133,7 +1220,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
                                             unsigned DestAS) const {
   // Flat -> private/local is a simple truncate.
   // Flat -> global is no-op
-  if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
+  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
     return true;
 
   return isNoopAddrSpaceCast(SrcAS, DestAS);
@@ -1146,7 +1233,7 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
 }
 
 TargetLoweringBase::LegalizeTypeAction
-SITargetLowering::getPreferredVectorAction(EVT VT) const {
+SITargetLowering::getPreferredVectorAction(MVT VT) const {
   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
     return TypeSplitVector;
 
@@ -1200,7 +1287,7 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
     = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
 
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
-  MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
+  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
     MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
 
@@ -1240,7 +1327,7 @@ SDValue SITargetLowering::lowerKernargMemParameter(
   uint64_t Offset, unsigned Align, bool Signed,
   const ISD::InputArg *Arg) const {
   Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
-  PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
+  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
 
   // Try to avoid using an extload by loading earlier than the argument address,
@@ -1349,7 +1436,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
     const ISD::InputArg *Arg = &Ins[I];
 
-    assert(!Arg->VT.isVector() && "vector type argument should have been split");
+    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
+           "vector type argument should have been split");
 
     // First check if it's a PS input addr.
     if (CallConv == CallingConv::AMDGPU_PS &&
@@ -1642,7 +1730,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  if (ST.isAmdCodeObjectV2(MF.getFunction())) {
+  if (ST.isAmdHsaOrMesa(MF.getFunction())) {
     if (RequiresStackAccess) {
       // If we have stack objects, we unquestionably need the private buffer
       // resource. For the Code Object V2 ABI, this will be the first 4 user
@@ -1951,29 +2039,6 @@ SDValue SITargetLowering::LowerFormalArguments(
       llvm_unreachable("Unknown loc info!");
     }
 
-    if (IsShader && Arg.VT.isVector()) {
-      // Build a vector from the registers
-      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
-      unsigned NumElements = ParamType->getVectorNumElements();
-
-      SmallVector<SDValue, 4> Regs;
-      Regs.push_back(Val);
-      for (unsigned j = 1; j != NumElements; ++j) {
-        Reg = ArgLocs[ArgIdx++].getLocReg();
-        Reg = MF.addLiveIn(Reg, RC);
-
-        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-        Regs.push_back(Copy);
-      }
-
-      // Fill up the missing vector elements
-      NumElements = Arg.VT.getVectorNumElements() - NumElements;
-      Regs.append(NumElements, DAG.getUNDEF(VT));
-
-      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
-      continue;
-    }
-
     InVals.push_back(Val);
   }
 
@@ -2037,48 +2102,19 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 
   bool IsShader = AMDGPU::isShader(CallConv);
 
-  Info->setIfReturnsVoid(Outs.size() == 0);
+  Info->setIfReturnsVoid(Outs.empty());
   bool IsWaveEnd = Info->returnsVoid() && IsShader;
 
-  SmallVector<ISD::OutputArg, 48> Splits;
-  SmallVector<SDValue, 48> SplitVals;
-
-  // Split vectors into their elements.
-  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
-    const ISD::OutputArg &Out = Outs[i];
-
-    if (IsShader && Out.VT.isVector()) {
-      MVT VT = Out.VT.getVectorElementType();
-      ISD::OutputArg NewOut = Out;
-      NewOut.Flags.setSplit();
-      NewOut.VT = VT;
-
-      // We want the original number of vector elements here, e.g.
-      // three or five, not four or eight.
-      unsigned NumElements = Out.ArgVT.getVectorNumElements();
-
-      for (unsigned j = 0; j != NumElements; ++j) {
-        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
-                                   DAG.getConstant(j, DL, MVT::i32));
-        SplitVals.push_back(Elem);
-        Splits.push_back(NewOut);
-        NewOut.PartOffset += NewOut.VT.getStoreSize();
-      }
-    } else {
-      SplitVals.push_back(OutVals[i]);
-      Splits.push_back(Out);
-    }
-  }
-
   // CCValAssign - represent the assignment of the return value to a location.
   SmallVector<CCValAssign, 48> RVLocs;
+  SmallVector<ISD::OutputArg, 48> Splits;
 
   // CCState - Info about the registers and stack slots.
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());
 
   // Analyze outgoing return values.
-  CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
+  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
 
   SDValue Flag;
   SmallVector<SDValue, 48> RetOps;
@@ -2103,14 +2139,12 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   }
 
   // Copy the result values into the output registers.
-  for (unsigned i = 0, realRVLocIdx = 0;
-       i != RVLocs.size();
-       ++i, ++realRVLocIdx) {
-    CCValAssign &VA = RVLocs[i];
+  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
+       ++I, ++RealRVLocIdx) {
+    CCValAssign &VA = RVLocs[I];
     assert(VA.isRegLoc() && "Can only return in registers!");
     // TODO: Partially return in registers if return values don't fit.
-
-    SDValue Arg = SplitVals[realRVLocIdx];
+    SDValue Arg = OutVals[RealRVLocIdx];
 
     // Copied from other backends.
     switch (VA.getLocInfo()) {
@@ -2225,11 +2259,11 @@ SDValue SITargetLowering::LowerCallResult(
 // from the explicit user arguments present in the IR.
 void SITargetLowering::passSpecialInputs(
     CallLoweringInfo &CLI,
+    CCState &CCInfo,
     const SIMachineFunctionInfo &Info,
     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
     SmallVectorImpl<SDValue> &MemOpChains,
-    SDValue Chain,
-    SDValue StackPtr) const {
+    SDValue Chain) const {
   // If we don't have a call site, this was a call inserted by
   // legalization. These can never use special inputs.
   if (!CLI.CS)
@@ -2297,9 +2331,9 @@ void SITargetLowering::passSpecialInputs(
     if (OutgoingArg->isRegister()) {
       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
     } else {
-      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
-                                              InputReg,
-                                              OutgoingArg->getStackOffset());
+      unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
+      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+                                              SpecialArgOffset);
       MemOpChains.push_back(ArgStore);
     }
   }
@@ -2424,6 +2458,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                               "unsupported call to variadic function ");
   }
 
+  if (!CLI.CS.getInstruction())
+    report_fatal_error("unsupported libcall legalization");
+
   if (!CLI.CS.getCalledFunction()) {
     return lowerUnhandledCall(CLI, InVals,
                               "unsupported indirect call to function ");
@@ -2442,8 +2479,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   // The first 4 bytes are reserved for the callee's emergency stack slot.
-  const unsigned CalleeUsableStackOffset = 4;
-
   if (IsTailCall) {
     IsTailCall = isEligibleForTailCallOptimization(
       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2463,25 +2498,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
       ++NumTailCalls;
   }
 
-  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    // FIXME: Remove this hack for function pointer types after removing
-    // support of old address space mapping. In the new address space
-    // mapping the pointer in default address space is 64 bit, therefore
-    // does not need this hack.
-    if (Callee.getValueType() == MVT::i32) {
-      const GlobalValue *GV = GA->getGlobal();
-      Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
-                                    GA->getTargetFlags());
-    }
-  }
-  assert(Callee.getValueType() == MVT::i64);
-
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+
+  // The first 4 bytes are reserved for the callee's emergency stack slot.
+  CCInfo.AllocateStack(4, 4);
+
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -2529,10 +2555,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     }
   }
 
-  // Stack pointer relative accesses are done by changing the offset SGPR. This
-  // is just the VGPR offset component.
-  SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
-
   SmallVector<SDValue, 8> MemOpChains;
   MVT PtrVT = MVT::i32;
 
@@ -2576,18 +2598,22 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
       unsigned LocMemOffset = VA.getLocMemOffset();
       int32_t Offset = LocMemOffset;
 
-      SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
+      SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
+      unsigned Align = 0;
 
       if (IsTailCall) {
         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
         unsigned OpSize = Flags.isByVal() ?
           Flags.getByValSize() : VA.getValVT().getStoreSize();
 
+        // FIXME: We can have better than the minimum byval required alignment.
+        Align = Flags.isByVal() ? Flags.getByValAlign() :
+          MinAlign(Subtarget->getStackAlignment(), Offset);
+
         Offset = Offset + FPDiff;
         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
 
-        DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
-                                         StackPtr);
+        DstAddr = DAG.getFrameIndex(FI, PtrVT);
         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
 
         // Make sure any stack arguments overlapping with where we're storing
@@ -2601,6 +2627,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
       } else {
         DstAddr = PtrOff;
         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
+        Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
       }
 
       if (Outs[i].Flags.isByVal()) {
@@ -2611,18 +2638,18 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
             /*isVol = */ false, /*AlwaysInline = */ true,
             /*isTailCall = */ false, DstInfo,
             MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
-                *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));
+                *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
 
         MemOpChains.push_back(Cpy);
       } else {
-        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
         MemOpChains.push_back(Store);
       }
     }
   }
 
   // Copy special input registers after user input arguments.
-  passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
+  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
 
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
@@ -3460,7 +3487,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
       MIB.add(MI.getOperand(I));
 
-    MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    MIB.cloneMemRefs(MI);
     MI.eraseFromParent();
     return BB;
   }
@@ -3628,7 +3655,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerDEBUGTRAP(Op, DAG);
   case ISD::FABS:
   case ISD::FNEG:
+  case ISD::FCANONICALIZE:
     return splitUnaryVectorOp(Op, DAG);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+    return lowerFMINNUM_FMAXNUM(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
@@ -3639,10 +3670,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
-  case ISD::FMINNUM:
-  case ISD::FMAXNUM:
   case ISD::FADD:
   case ISD::FMUL:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
     return splitBinaryVectorOp(Op, DAG);
   }
   return SDValue();
@@ -3678,18 +3709,9 @@ static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
                                               MemSDNode *M,
                                               SelectionDAG &DAG,
+                                              ArrayRef<SDValue> Ops,
                                               bool IsIntrinsic) const {
   SDLoc DL(M);
-  SmallVector<SDValue, 10> Ops;
-  Ops.reserve(M->getNumOperands());
-
-  Ops.push_back(M->getOperand(0));
-  if (IsIntrinsic)
-    Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
-
-  // Skip 1, as it is the intrinsic ID.
-  for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
-    Ops.push_back(M->getOperand(I));
 
   bool Unpacked = Subtarget->hasUnpackedD16VMem();
   EVT LoadVT = M->getValueType(0);
@@ -3717,6 +3739,69 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
   return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
 }
 
+static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
+                                  SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
+  if (!CD)
+    return DAG.getUNDEF(VT);
+
+  int CondCode = CD->getSExtValue();
+  if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
+      CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
+    return DAG.getUNDEF(VT);
+
+  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
+
+
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
+  SDLoc DL(N);
+
+  EVT CmpVT = LHS.getValueType();
+  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
+    unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
+      ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
+    RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
+  }
+
+  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
+
+  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
+                     DAG.getCondCode(CCOpcode));
+}
+
+static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
+                                  SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
+  if (!CD)
+    return DAG.getUNDEF(VT);
+
+  int CondCode = CD->getSExtValue();
+  if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
+      CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
+    return DAG.getUNDEF(VT);
+  }
+
+  SDValue Src0 = N->getOperand(1);
+  SDValue Src1 = N->getOperand(2);
+  EVT CmpVT = Src0.getValueType();
+  SDLoc SL(N);
+
+  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
+    Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
+    Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
+  }
+
+  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
+  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
+  return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
+                     Src1, DAG.getCondCode(CCOpcode));
+}
+
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
@@ -3761,8 +3846,13 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
       else
         Opcode = AMDGPUISD::CVT_PK_U16_U32;
 
-      SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
-      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
+      EVT VT = N->getValueType(0);
+      if (isTypeLegal(VT))
+        Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
+      else {
+        SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
+        Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
+      }
       return;
     }
     }
@@ -3895,15 +3985,15 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
 
 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
   const Triple &TT = getTargetMachine().getTargetTriple();
-  return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
-          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
+  return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
          AMDGPU::shouldEmitConstantsToTextSection(TT);
 }
 
 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
-  return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
-          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
-          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
+  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
          !shouldEmitFixup(GV) &&
          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
 }
@@ -4038,6 +4128,23 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
 }
 
+SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+
+  // FIXME: Assert during eslection that this is only selected for
+  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
+  // mode functions, but this happens to be OK since it's only done in cases
+  // where there is known no sNaN.
+  if (IsIEEEMode)
+    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
+
+  if (VT == MVT::v4f16)
+    return splitBinaryVectorOp(Op, DAG);
+  return Op;
+}
+
 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Chain = Op.getOperand(0);
@@ -4091,10 +4198,10 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                              SelectionDAG &DAG) const {
   // FIXME: Use inline constants (src_{shared, private}_base) instead.
   if (Subtarget->hasApertureRegs()) {
-    unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
+    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
-    unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
+    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
     unsigned Encoding =
@@ -4119,7 +4226,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
 
   // Offset into amd_queue_t for group_segment_aperture_base_hi /
   // private_segment_aperture_base_hi.
-  uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
+  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
 
   SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
 
@@ -4127,7 +4234,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
   // TODO: We should use the value from the IR intrinsic call, but it might not
   // be available and how do we get it?
   Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
-                                              AMDGPUASI.CONSTANT_ADDRESS));
+                                              AMDGPUAS::CONSTANT_ADDRESS));
 
   MachinePointerInfo PtrInfo(V, StructOffset);
   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
@@ -4148,11 +4255,11 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
 
   // flat -> local/private
-  if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
+  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
     unsigned DestAS = ASC->getDestAddressSpace();
 
-    if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
-        DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
+    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
+        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
       unsigned NullVal = TM.getNullPointerValue(DestAS);
       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
@@ -4164,11 +4271,11 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
   }
 
   // local/private -> flat
-  if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
+  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
     unsigned SrcAS = ASC->getSrcAddressSpace();
 
-    if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
-        SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
+    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
       unsigned NullVal = TM.getNullPointerValue(SrcAS);
       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
 
@@ -4335,30 +4442,39 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
   }
 
   assert(VT == MVT::v2f16 || VT == MVT::v2i16);
+  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
 
   SDValue Lo = Op.getOperand(0);
   SDValue Hi = Op.getOperand(1);
 
-  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
-  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
+  // Avoid adding defined bits with the zero_extend.
+  if (Hi.isUndef()) {
+    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
+    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
+    return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
+  }
 
-  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
   Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
 
   SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                               DAG.getConstant(16, SL, MVT::i32));
+  if (Lo.isUndef())
+    return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
 
-  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
+  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
 
+  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
   return DAG.getNode(ISD::BITCAST, SL, VT, Or);
 }
 
 bool
 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // We can fold offsets for anything that doesn't require a GOT relocation.
-  return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
-          GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
-          GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
+  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
          !shouldEmitGOTReloc(GA->getGlobal());
 }
 
@@ -4409,18 +4525,15 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                              SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = GSD->getGlobal();
-
-  if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
-      GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
-      GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
-      // FIXME: It isn't correct to rely on the type of the pointer. This should
-      // be removed when address space 0 is 64-bit.
-      !GV->getType()->getElementType()->isFunctionTy())
+  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
+      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 
   SDLoc DL(GSD);
   EVT PtrVT = Op.getValueType();
 
+  // FIXME: Should not make address space based decisions here.
   if (shouldEmitFixup(GV))
     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
   else if (shouldEmitPCReloc(GV))
@@ -4431,11 +4544,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SIInstrInfo::MO_GOTPCREL32);
 
   Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
-  PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
+  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
   const DataLayout &DataLayout = DAG.getDataLayout();
   unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
-  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
-  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+  MachinePointerInfo PtrInfo
+    = MachinePointerInfo::getGOT(DAG.getMachineFunction());
 
   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
                      MachineMemOperand::MODereferenceable |
@@ -4547,11 +4660,115 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
   return Value == 0;
 }
 
+// Re-construct the required return value for a image load intrinsic.
+// This is more complicated due to the optional use TexFailCtrl which means the required
+// return type is an aggregate
+static SDValue constructRetValue(SelectionDAG &DAG,
+                                 MachineSDNode *Result,
+                                 ArrayRef<EVT> ResultTypes,
+                                 bool IsTexFail, bool Unpacked, bool IsD16,
+                                 int DMaskPop, int NumVDataDwords,
+                                 const SDLoc &DL, LLVMContext &Context) {
+  // Determine the required return type. This is the same regardless of IsTexFail flag
+  EVT ReqRetVT = ResultTypes[0];
+  EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
+  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
+  EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
+  EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
+                                           : AdjEltVT
+                       : ReqRetVT;
+
+  // Extract data part of the result
+  // Bitcast the result to the same type as the required return type
+  int NumElts;
+  if (IsD16 && !Unpacked)
+    NumElts = NumVDataDwords << 1;
+  else
+    NumElts = NumVDataDwords;
+
+  EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
+                           : AdjEltVT;
+
+  // Special case for v8f16. Rather than add support for this, use v4i32 to
+  // extract the data elements
+  bool V8F16Special = false;
+  if (CastVT == MVT::v8f16) {
+    CastVT = MVT::v4i32;
+    DMaskPop >>= 1;
+    ReqRetNumElts >>= 1;
+    V8F16Special = true;
+    AdjVT = MVT::v2i32;
+  }
+
+  SDValue N = SDValue(Result, 0);
+  SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
+
+  // Iterate over the result
+  SmallVector<SDValue, 4> BVElts;
+
+  if (CastVT.isVector()) {
+    DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
+  } else {
+    BVElts.push_back(CastRes);
+  }
+  int ExtraElts = ReqRetNumElts - DMaskPop;
+  while(ExtraElts--)
+    BVElts.push_back(DAG.getUNDEF(AdjEltVT));
+
+  SDValue PreTFCRes;
+  if (ReqRetNumElts > 1) {
+    SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
+    if (IsD16 && Unpacked)
+      PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
+    else
+      PreTFCRes = NewVec;
+  } else {
+    PreTFCRes = BVElts[0];
+  }
+
+  if (V8F16Special)
+    PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
+
+  if (!IsTexFail) {
+    if (Result->getNumValues() > 1)
+      return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
+    else
+      return PreTFCRes;
+  }
+
+  // Extract the TexFail result and insert into aggregate return
+  SmallVector<SDValue, 1> TFCElt;
+  DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
+  SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
+  return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
+}
+
+static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
+                         SDValue *LWE, bool &IsTexFail) {
+  auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
+  if (!TexFailCtrlConst)
+    return false;
+
+  uint64_t Value = TexFailCtrlConst->getZExtValue();
+  if (Value) {
+    IsTexFail = true;
+  }
+
+  SDLoc DL(TexFailCtrlConst);
+  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
+  Value &= ~(uint64_t)0x1;
+  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
+  Value &= ~(uint64_t)0x2;
+
+  return Value == 0;
+}
+
 SDValue SITargetLowering::lowerImage(SDValue Op,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      SelectionDAG &DAG) const {
   SDLoc DL(Op);
   MachineFunction &MF = DAG.getMachineFunction();
+  const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
@@ -4559,12 +4776,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
   unsigned IntrOpcode = Intr->BaseOpcode;
 
-  SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
+  SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
+  SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
   bool IsD16 = false;
+  bool IsA16 = false;
   SDValue VData;
   int NumVDataDwords;
+  bool AdjustRetType = false;
+
   unsigned AddrIdx; // Index of first address argument
   unsigned DMask;
+  unsigned DMaskLanes = 0;
 
   if (BaseOpcode->Atomic) {
     VData = Op.getOperand(2);
@@ -4587,7 +4809,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       AddrIdx = 3;
     }
   } else {
-    unsigned DMaskIdx;
+    unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
+    auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+    if (!DMaskConst)
+      return Op;
+    DMask = DMaskConst->getZExtValue();
+    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
 
     if (BaseOpcode->Store) {
       VData = Op.getOperand(2);
@@ -4603,58 +4830,91 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       }
 
       NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
-      DMaskIdx = 3;
     } else {
-      MVT LoadVT = Op.getSimpleValueType();
+      // Work out the num dwords based on the dmask popcount and underlying type
+      // and whether packing is supported.
+      MVT LoadVT = ResultTypes[0].getSimpleVT();
       if (LoadVT.getScalarType() == MVT::f16) {
         if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
             !BaseOpcode->HasD16)
           return Op; // D16 is unsupported for this instruction
 
         IsD16 = true;
-        if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
-          ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
       }
 
-      NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
-      DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
-    }
+      // Confirm that the return type is large enough for the dmask specified
+      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
+          (!LoadVT.isVector() && DMaskLanes > 1))
+          return Op;
 
-    auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
-    if (!DMaskConst)
-      return Op;
+      if (IsD16 && !Subtarget->hasUnpackedD16VMem())
+        NumVDataDwords = (DMaskLanes + 1) / 2;
+      else
+        NumVDataDwords = DMaskLanes;
 
-    AddrIdx = DMaskIdx + 1;
-    DMask = DMaskConst->getZExtValue();
-    if (!DMask && !BaseOpcode->Store) {
-      // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
-      // store the channels' default values.
-      SDValue Undef = DAG.getUNDEF(Op.getValueType());
-      if (isa<MemSDNode>(Op))
-        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
-      return Undef;
+      AdjustRetType = true;
     }
+
+    AddrIdx = DMaskIdx + 1;
   }
 
-  unsigned NumVAddrs = BaseOpcode->NumExtraArgs +
-                       (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
-                       (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
-                       (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+  unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
+  unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
+  unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
+  unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
+                       NumCoords + NumLCM;
+  unsigned NumMIVAddrs = NumVAddrs;
+
   SmallVector<SDValue, 4> VAddrs;
-  for (unsigned i = 0; i < NumVAddrs; ++i)
-    VAddrs.push_back(Op.getOperand(AddrIdx + i));
 
   // Optimize _L to _LZ when _L is zero
   if (LZMappingInfo) {
     if (auto ConstantLod =
-         dyn_cast<ConstantFPSDNode>(VAddrs[NumVAddrs-1].getNode())) {
+         dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
         IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
-        VAddrs.pop_back();               // remove 'lod'
+        NumMIVAddrs--;               // remove 'lod'
       }
     }
   }
 
+  // Check for 16 bit addresses and pack if true.
+  unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
+  MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
+  const MVT VAddrScalarVT = VAddrVT.getScalarType();
+  if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
+      ST->hasFeature(AMDGPU::FeatureR128A16)) {
+    IsA16 = true;
+    const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
+    for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
+      SDValue AddrLo, AddrHi;
+      // Push back extra arguments.
+      if (i < DimIdx) {
+        AddrLo = Op.getOperand(i);
+      } else {
+        AddrLo = Op.getOperand(i);
+        // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
+        // in 1D, derivatives dx/dh and dx/dv are packed with undef.
+        if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
+            ((NumGradients / 2) % 2 == 1 &&
+            (i == DimIdx + (NumGradients / 2) - 1 ||
+             i == DimIdx + NumGradients - 1))) {
+          AddrHi = DAG.getUNDEF(MVT::f16);
+        } else {
+          AddrHi = Op.getOperand(i + 1);
+          i++;
+        }
+        AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
+                             {AddrLo, AddrHi});
+        AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
+      }
+      VAddrs.push_back(AddrLo);
+    }
+  } else {
+    for (unsigned i = 0; i < NumMIVAddrs; ++i)
+      VAddrs.push_back(Op.getOperand(AddrIdx + i));
+  }
+
   SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
 
   SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
@@ -4674,11 +4934,53 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     CtrlIdx = AddrIdx + NumVAddrs + 3;
   }
 
+  SDValue TFE;
+  SDValue LWE;
   SDValue TexFail = Op.getOperand(CtrlIdx);
-  auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
-  if (!TexFailConst || TexFailConst->getZExtValue() != 0)
+  bool IsTexFail = false;
+  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
     return Op;
 
+  if (IsTexFail) {
+    if (!DMaskLanes) {
+      // Expecting to get an error flag since TFC is on - and dmask is 0
+      // Force dmask to be at least 1 otherwise the instruction will fail
+      DMask = 0x1;
+      DMaskLanes = 1;
+      NumVDataDwords = 1;
+    }
+    NumVDataDwords += 1;
+    AdjustRetType = true;
+  }
+
+  // Has something earlier tagged that the return type needs adjusting
+  // This happens if the instruction is a load or has set TexFailCtrl flags
+  if (AdjustRetType) {
+    // NumVDataDwords reflects the true number of dwords required in the return type
+    if (DMaskLanes == 0 && !BaseOpcode->Store) {
+      // This is a no-op load. This can be eliminated
+      SDValue Undef = DAG.getUNDEF(Op.getValueType());
+      if (isa<MemSDNode>(Op))
+        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
+      return Undef;
+    }
+
+    // Have to use a power of 2 number of dwords
+    NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
+
+    EVT NewVT = NumVDataDwords > 1 ?
+                  EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
+                : MVT::f32;
+
+    ResultTypes[0] = NewVT;
+    if (ResultTypes.size() == 3) {
+      // Original result was aggregate type used for TexFailCtrl results
+      // The actual instruction returns as a vector type which has now been
+      // created. Remove the aggregate result.
+      ResultTypes.erase(&ResultTypes[1]);
+    }
+  }
+
   SDValue GLC;
   SDValue SLC;
   if (BaseOpcode->Atomic) {
@@ -4701,9 +5003,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   Ops.push_back(Unorm);
   Ops.push_back(GLC);
   Ops.push_back(SLC);
-  Ops.push_back(False); // r128
-  Ops.push_back(False); // tfe
-  Ops.push_back(False); // lwe
+  Ops.push_back(IsA16 &&  // a16 or r128
+                ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
+  Ops.push_back(TFE); // tfe
+  Ops.push_back(LWE); // lwe
   Ops.push_back(DimInfo->DA ? True : False);
   if (BaseOpcode->HasD16)
     Ops.push_back(IsD16 ? True : False);
@@ -4723,25 +5026,90 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
 
   MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
   if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
-    MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
-    *MemRefs = MemOp->getMemOperand();
-    NewNode->setMemRefs(MemRefs, MemRefs + 1);
+    MachineMemOperand *MemRef = MemOp->getMemOperand();
+    DAG.setNodeMemRefs(NewNode, {MemRef});
   }
 
   if (BaseOpcode->AtomicX2) {
     SmallVector<SDValue, 1> Elt;
     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
-  } else if (IsD16 && !BaseOpcode->Store) {
-    MVT LoadVT = Op.getSimpleValueType();
-    SDValue Adjusted = adjustLoadValueTypeImpl(
-        SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
-    return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
+  } else if (!BaseOpcode->Store) {
+    return constructRetValue(DAG, NewNode,
+                             OrigResultTypes, IsTexFail,
+                             Subtarget->hasUnpackedD16VMem(), IsD16,
+                             DMaskLanes, NumVDataDwords, DL,
+                             *DAG.getContext());
   }
 
   return SDValue(NewNode, 0);
 }
 
+SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
+                                       SDValue Offset, SDValue GLC,
+                                       SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+          MachineMemOperand::MOInvariant,
+      VT.getStoreSize(), VT.getStoreSize());
+
+  if (!Offset->isDivergent()) {
+    SDValue Ops[] = {
+        Rsrc,
+        Offset, // Offset
+        GLC     // glc
+    };
+    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                   DAG.getVTList(VT), Ops, VT, MMO);
+  }
+
+  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
+  // assume that the buffer is unswizzled.
+  SmallVector<SDValue, 4> Loads;
+  unsigned NumLoads = 1;
+  MVT LoadVT = VT.getSimpleVT();
+  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
+  assert((LoadVT.getScalarType() == MVT::i32 ||
+          LoadVT.getScalarType() == MVT::f32) &&
+         isPowerOf2_32(NumElts));
+
+  if (NumElts == 8 || NumElts == 16) {
+    NumLoads = NumElts == 16 ? 4 : 2;
+    LoadVT = MVT::v4i32;
+  }
+
+  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
+  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
+  SDValue Ops[] = {
+      DAG.getEntryNode(),                         // Chain
+      Rsrc,                                       // rsrc
+      DAG.getConstant(0, DL, MVT::i32),           // vindex
+      {},                                         // voffset
+      {},                                         // soffset
+      {},                                         // offset
+      DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1),            // idxen
+  };
+
+  // Use the alignment to ensure that the required offsets will fit into the
+  // immediate offsets.
+  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
+
+  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
+    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
+                                            Ops, LoadVT, MMO));
+  }
+
+  if (VT == MVT::v8i32 || VT == MVT::v16i32)
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
+
+  return Loads[0];
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -4755,14 +5123,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
   switch (IntrinsicID) {
   case Intrinsic::amdgcn_implicit_buffer_ptr: {
-    if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction()))
+    if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
       return emitNonHSAIntrinsicError(DAG, DL, VT);
     return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
   }
   case Intrinsic::amdgcn_dispatch_ptr:
   case Intrinsic::amdgcn_queue_ptr: {
-    if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) {
+    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
       DiagnosticInfoUnsupported BadIntrin(
           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
           DL.getDebugLoc());
@@ -4880,12 +5248,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::r600_read_tgid_z:
     return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
-  case Intrinsic::amdgcn_workitem_id_x: {
+  case Intrinsic::amdgcn_workitem_id_x:
   case Intrinsic::r600_read_tidig_x:
     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                           SDLoc(DAG.getEntryNode()),
                           MFI->getArgInfo().WorkItemIDX);
-  }
   case Intrinsic::amdgcn_workitem_id_y:
   case Intrinsic::r600_read_tidig_y:
     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
@@ -4896,19 +5263,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                           SDLoc(DAG.getEntryNode()),
                           MFI->getArgInfo().WorkItemIDZ);
-  case AMDGPUIntrinsic::SI_load_const: {
-    SDValue Ops[] = {
-      Op.getOperand(1),
-      Op.getOperand(2)
-    };
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo(),
-        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-            MachineMemOperand::MOInvariant,
-        VT.getStoreSize(), 4);
-    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+  case SIIntrinsic::SI_load_const: {
+    SDValue Load =
+        lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
+                     DAG.getTargetConstant(0, DL, MVT::i1), DAG);
+    return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
+  }
+  case Intrinsic::amdgcn_s_buffer_load: {
+    unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
+                        DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
@@ -4991,34 +5355,15 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                        Denominator, Numerator);
   }
   case Intrinsic::amdgcn_icmp: {
-    const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
-    if (!CD)
-      return DAG.getUNDEF(VT);
-
-    int CondCode = CD->getSExtValue();
-    if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
-        CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
-      return DAG.getUNDEF(VT);
-
-    ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
-    ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
-    return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
-                       Op.getOperand(2), DAG.getCondCode(CCOpcode));
+    // There is a Pat that handles this variant, so return it as-is.
+    if (Op.getOperand(1).getValueType() == MVT::i1 &&
+        Op.getConstantOperandVal(2) == 0 &&
+        Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
+      return Op;
+    return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
   }
   case Intrinsic::amdgcn_fcmp: {
-    const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
-    if (!CD)
-      return DAG.getUNDEF(VT);
-
-    int CondCode = CD->getSExtValue();
-    if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
-        CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
-      return DAG.getUNDEF(VT);
-
-    FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
-    ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
-    return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
-                       Op.getOperand(2), DAG.getCondCode(CCOpcode));
+    return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
   }
   case Intrinsic::amdgcn_fmed3:
     return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
@@ -5058,6 +5403,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     else
       Opcode = AMDGPUISD::CVT_PK_U16_U32;
 
+    if (isTypeLegal(VT))
+      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
+
     SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
                                Op.getOperand(1), Op.getOperand(2));
     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
@@ -5127,36 +5475,104 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_buffer_load:
   case Intrinsic::amdgcn_buffer_load_format: {
+    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
+    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+    unsigned IdxEn = 1;
+    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
+      IdxEn = Idx->getZExtValue() != 0;
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
       Op.getOperand(2), // rsrc
       Op.getOperand(3), // vindex
-      Op.getOperand(4), // offset
-      Op.getOperand(5), // glc
-      Op.getOperand(6)  // slc
+      SDValue(),        // voffset -- will be set by setBufferOffsets
+      SDValue(),        // soffset -- will be set by setBufferOffsets
+      SDValue(),        // offset -- will be set by setBufferOffsets
+      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
     };
 
+    setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+
     EVT VT = Op.getValueType();
     EVT IntVT = VT.changeTypeToInteger();
     auto *M = cast<MemSDNode>(Op);
     EVT LoadVT = Op.getValueType();
-    bool IsD16 = LoadVT.getScalarType() == MVT::f16;
-    if (IsD16)
-      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
 
+    if (LoadVT.getScalarType() == MVT::f16)
+      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+                                 M, DAG, Ops);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+                                   M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_raw_buffer_load:
+  case Intrinsic::amdgcn_raw_buffer_load_format: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // rsrc
+      DAG.getConstant(0, DL, MVT::i32), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(4), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(5), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1), // idxen
+    };
+
+    unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
+        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+
+    EVT VT = Op.getValueType();
+    EVT IntVT = VT.changeTypeToInteger();
+    auto *M = cast<MemSDNode>(Op);
+    EVT LoadVT = Op.getValueType();
+
+    if (LoadVT.getScalarType() == MVT::f16)
+      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+                                 M, DAG, Ops);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+                                   M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_struct_buffer_load:
+  case Intrinsic::amdgcn_struct_buffer_load_format: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // rsrc
+      Op.getOperand(3), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(5), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(6), // cachepolicy
+      DAG.getConstant(1, DL, MVT::i1), // idxen
+    };
+
+    unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
+        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+
+    EVT VT = Op.getValueType();
+    EVT IntVT = VT.changeTypeToInteger();
+    auto *M = cast<MemSDNode>(Op);
+    EVT LoadVT = Op.getValueType();
+
+    if (LoadVT.getScalarType() == MVT::f16)
+      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+                                 M, DAG, Ops);
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
   case Intrinsic::amdgcn_tbuffer_load: {
     MemSDNode *M = cast<MemSDNode>(Op);
     EVT LoadVT = Op.getValueType();
-    bool IsD16 = LoadVT.getScalarType() == MVT::f16;
-    if (IsD16) {
-      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
-    }
 
+    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
+    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
+    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
+    unsigned IdxEn = 1;
+    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
+      IdxEn = Idx->getZExtValue() != 0;
     SDValue Ops[] = {
       Op.getOperand(0),  // Chain
       Op.getOperand(2),  // rsrc
@@ -5164,12 +5580,62 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       Op.getOperand(4),  // voffset
       Op.getOperand(5),  // soffset
       Op.getOperand(6),  // offset
-      Op.getOperand(7),  // dfmt
-      Op.getOperand(8),  // nfmt
-      Op.getOperand(9),  // glc
-      Op.getOperand(10)   // slc
+      DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+    };
+
+    if (LoadVT.getScalarType() == MVT::f16)
+      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
+                                 M, DAG, Ops);
+    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+                                   Op->getVTList(), Ops, LoadVT,
+                                   M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_raw_tbuffer_load: {
+    MemSDNode *M = cast<MemSDNode>(Op);
+    EVT LoadVT = Op.getValueType();
+    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+
+    SDValue Ops[] = {
+      Op.getOperand(0),  // Chain
+      Op.getOperand(2),  // rsrc
+      DAG.getConstant(0, DL, MVT::i32), // vindex
+      Offsets.first,     // voffset
+      Op.getOperand(4),  // soffset
+      Offsets.second,    // offset
+      Op.getOperand(5),  // format
+      Op.getOperand(6),  // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1), // idxen
+    };
+
+    if (LoadVT.getScalarType() == MVT::f16)
+      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
+                                 M, DAG, Ops);
+    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+                                   Op->getVTList(), Ops, LoadVT,
+                                   M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_struct_tbuffer_load: {
+    MemSDNode *M = cast<MemSDNode>(Op);
+    EVT LoadVT = Op.getValueType();
+    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+
+    SDValue Ops[] = {
+      Op.getOperand(0),  // Chain
+      Op.getOperand(2),  // rsrc
+      Op.getOperand(3),  // vindex
+      Offsets.first,     // voffset
+      Op.getOperand(5),  // soffset
+      Offsets.second,    // offset
+      Op.getOperand(6),  // format
+      Op.getOperand(7),  // cachepolicy
+      DAG.getConstant(1, DL, MVT::i1), // idxen
     };
 
+    if (LoadVT.getScalarType() == MVT::f16)
+      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
+                                 M, DAG, Ops);
     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                                    Op->getVTList(), Ops, LoadVT,
                                    M->getMemOperand());
@@ -5184,14 +5650,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_buffer_atomic_and:
   case Intrinsic::amdgcn_buffer_atomic_or:
   case Intrinsic::amdgcn_buffer_atomic_xor: {
+    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+    unsigned IdxEn = 1;
+    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+      IdxEn = Idx->getZExtValue() != 0;
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
       Op.getOperand(2), // vdata
       Op.getOperand(3), // rsrc
       Op.getOperand(4), // vindex
-      Op.getOperand(5), // offset
-      Op.getOperand(6)  // slc
+      SDValue(),        // voffset -- will be set by setBufferOffsets
+      SDValue(),        // soffset -- will be set by setBufferOffsets
+      SDValue(),        // offset -- will be set by setBufferOffsets
+      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
     };
+    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
     EVT VT = Op.getValueType();
 
     auto *M = cast<MemSDNode>(Op);
@@ -5235,16 +5709,193 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                    M->getMemOperand());
   }
+  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+  case Intrinsic::amdgcn_raw_buffer_atomic_add:
+  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+  case Intrinsic::amdgcn_raw_buffer_atomic_and:
+  case Intrinsic::amdgcn_raw_buffer_atomic_or:
+  case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // vdata
+      Op.getOperand(3), // rsrc
+      DAG.getConstant(0, DL, MVT::i32), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(5), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(6), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1), // idxen
+    };
+    EVT VT = Op.getValueType();
+
+    auto *M = cast<MemSDNode>(Op);
+    unsigned Opcode = 0;
+
+    switch (IntrID) {
+    case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_add:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_and:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_or:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
+      break;
+    default:
+      llvm_unreachable("unhandled atomic opcode");
+    }
+
+    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+                                   M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+  case Intrinsic::amdgcn_struct_buffer_atomic_add:
+  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_and:
+  case Intrinsic::amdgcn_struct_buffer_atomic_or:
+  case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // vdata
+      Op.getOperand(3), // rsrc
+      Op.getOperand(4), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(6), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(7), // cachepolicy
+      DAG.getConstant(1, DL, MVT::i1), // idxen
+    };
+    EVT VT = Op.getValueType();
+
+    auto *M = cast<MemSDNode>(Op);
+    unsigned Opcode = 0;
+
+    switch (IntrID) {
+    case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_add:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_and:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_or:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
+      break;
+    default:
+      llvm_unreachable("unhandled atomic opcode");
+    }
 
+    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+                                   M->getMemOperand());
+  }
   case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
+    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+    unsigned IdxEn = 1;
+    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
+      IdxEn = Idx->getZExtValue() != 0;
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // src
+      Op.getOperand(3), // cmp
+      Op.getOperand(4), // rsrc
+      Op.getOperand(5), // vindex
+      SDValue(),        // voffset -- will be set by setBufferOffsets
+      SDValue(),        // soffset -- will be set by setBufferOffsets
+      SDValue(),        // offset -- will be set by setBufferOffsets
+      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+    };
+    setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+    EVT VT = Op.getValueType();
+    auto *M = cast<MemSDNode>(Op);
+
+    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
+                                   Op->getVTList(), Ops, VT, M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // src
+      Op.getOperand(3), // cmp
+      Op.getOperand(4), // rsrc
+      DAG.getConstant(0, DL, MVT::i32), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(6), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(7), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1), // idxen
+    };
+    EVT VT = Op.getValueType();
+    auto *M = cast<MemSDNode>(Op);
+
+    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
+                                   Op->getVTList(), Ops, VT, M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
       Op.getOperand(2), // src
       Op.getOperand(3), // cmp
       Op.getOperand(4), // rsrc
       Op.getOperand(5), // vindex
-      Op.getOperand(6), // offset
-      Op.getOperand(7)  // slc
+      Offsets.first,    // voffset
+      Op.getOperand(7), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(8), // cachepolicy
+      DAG.getConstant(1, DL, MVT::i1), // idxen
     };
     EVT VT = Op.getValueType();
     auto *M = cast<MemSDNode>(Op);
@@ -5360,19 +6011,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
                        Op.getOperand(2), Op.getOperand(3));
   }
-  case AMDGPUIntrinsic::AMDGPU_kill: {
-    SDValue Src = Op.getOperand(2);
-    if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
-      if (!K->isNegative())
-        return Chain;
-
-      SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
-      return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
-    }
-
-    SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
-    return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
-  }
   case Intrinsic::amdgcn_s_barrier: {
     if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
       const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -5383,69 +6021,79 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     }
     return SDValue();
   };
-  case AMDGPUIntrinsic::SI_tbuffer_store: {
-
-    // Extract vindex and voffset from vaddr as appropriate
-    const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
-    const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
-    SDValue VAddr = Op.getOperand(5);
-
-    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
-
-    assert(!(OffEn->isOne() && IdxEn->isOne()) &&
-           "Legacy intrinsic doesn't support both offset and index - use new version");
-
-    SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
-    SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
-
-    // Deal with the vec-3 case
-    const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
-    auto Opcode = NumChannels->getZExtValue() == 3 ?
-      AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
-
+  case Intrinsic::amdgcn_tbuffer_store: {
+    SDValue VData = Op.getOperand(2);
+    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+    if (IsD16)
+      VData = handleD16VData(VData, DAG);
+    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
+    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
+    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
+    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
+    unsigned IdxEn = 1;
+    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+      IdxEn = Idx->getZExtValue() != 0;
     SDValue Ops[] = {
-     Chain,
-     Op.getOperand(3),  // vdata
-     Op.getOperand(2),  // rsrc
-     VIndex,
-     VOffset,
-     Op.getOperand(6),  // soffset
-     Op.getOperand(7),  // inst_offset
-     Op.getOperand(8),  // dfmt
-     Op.getOperand(9),  // nfmt
-     Op.getOperand(12), // glc
-     Op.getOperand(13), // slc
+      Chain,
+      VData,             // vdata
+      Op.getOperand(3),  // rsrc
+      Op.getOperand(4),  // vindex
+      Op.getOperand(5),  // voffset
+      Op.getOperand(6),  // soffset
+      Op.getOperand(7),  // offset
+      DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+      DAG.getConstant(IdxEn, DL, MVT::i1), // idexen
     };
-
-    assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
-           "Value of tfe other than zero is unsupported");
-
-    EVT VT = Op.getOperand(3).getValueType();
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(),
-      MachineMemOperand::MOStore,
-      VT.getStoreSize(), 4);
-    return DAG.getMemIntrinsicNode(Opcode, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
+                           AMDGPUISD::TBUFFER_STORE_FORMAT;
+    MemSDNode *M = cast<MemSDNode>(Op);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
   }
 
-  case Intrinsic::amdgcn_tbuffer_store: {
+  case Intrinsic::amdgcn_struct_tbuffer_store: {
     SDValue VData = Op.getOperand(2);
     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
     if (IsD16)
       VData = handleD16VData(VData, DAG);
+    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
     SDValue Ops[] = {
       Chain,
       VData,             // vdata
       Op.getOperand(3),  // rsrc
       Op.getOperand(4),  // vindex
-      Op.getOperand(5),  // voffset
+      Offsets.first,     // voffset
       Op.getOperand(6),  // soffset
-      Op.getOperand(7),  // offset
-      Op.getOperand(8),  // dfmt
-      Op.getOperand(9),  // nfmt
-      Op.getOperand(10), // glc
-      Op.getOperand(11)  // slc
+      Offsets.second,    // offset
+      Op.getOperand(7),  // format
+      Op.getOperand(8),  // cachepolicy
+      DAG.getConstant(1, DL, MVT::i1), // idexen
+    };
+    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
+                           AMDGPUISD::TBUFFER_STORE_FORMAT;
+    MemSDNode *M = cast<MemSDNode>(Op);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
+  }
+
+  case Intrinsic::amdgcn_raw_tbuffer_store: {
+    SDValue VData = Op.getOperand(2);
+    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+    if (IsD16)
+      VData = handleD16VData(VData, DAG);
+    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+    SDValue Ops[] = {
+      Chain,
+      VData,             // vdata
+      Op.getOperand(3),  // rsrc
+      DAG.getConstant(0, DL, MVT::i32), // vindex
+      Offsets.first,     // voffset
+      Op.getOperand(5),  // soffset
+      Offsets.second,    // offset
+      Op.getOperand(6),  // format
+      Op.getOperand(7),  // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1), // idexen
     };
     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                            AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -5460,15 +6108,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
     if (IsD16)
       VData = handleD16VData(VData, DAG);
+    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+    unsigned IdxEn = 1;
+    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+      IdxEn = Idx->getZExtValue() != 0;
     SDValue Ops[] = {
       Chain,
-      VData,            // vdata
+      VData,
       Op.getOperand(3), // rsrc
       Op.getOperand(4), // vindex
-      Op.getOperand(5), // offset
-      Op.getOperand(6), // glc
-      Op.getOperand(7)  // slc
+      SDValue(), // voffset -- will be set by setBufferOffsets
+      SDValue(), // soffset -- will be set by setBufferOffsets
+      SDValue(), // offset -- will be set by setBufferOffsets
+      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
     };
+    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
@@ -5476,6 +6132,59 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
+
+  case Intrinsic::amdgcn_raw_buffer_store:
+  case Intrinsic::amdgcn_raw_buffer_store_format: {
+    SDValue VData = Op.getOperand(2);
+    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+    if (IsD16)
+      VData = handleD16VData(VData, DAG);
+    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+    SDValue Ops[] = {
+      Chain,
+      VData,
+      Op.getOperand(3), // rsrc
+      DAG.getConstant(0, DL, MVT::i32), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(5), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(6), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1), // idxen
+    };
+    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
+                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
+    MemSDNode *M = cast<MemSDNode>(Op);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
+  }
+
+  case Intrinsic::amdgcn_struct_buffer_store:
+  case Intrinsic::amdgcn_struct_buffer_store_format: {
+    SDValue VData = Op.getOperand(2);
+    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+    if (IsD16)
+      VData = handleD16VData(VData, DAG);
+    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+    SDValue Ops[] = {
+      Chain,
+      VData,
+      Op.getOperand(3), // rsrc
+      Op.getOperand(4), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(6), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(7), // cachepolicy
+      DAG.getConstant(1, DL, MVT::i1), // idxen
+    };
+    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
+                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
+    MemSDNode *M = cast<MemSDNode>(Op);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
+  }
+
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -5486,6 +6195,94 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   }
 }
 
+// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
+// offset (the offset that is included in bounds checking and swizzling, to be
+// split between the instruction's voffset and immoffset fields) and soffset
+// (the offset that is excluded from bounds checking and swizzling, to go in
+// the instruction's soffset field).  This function takes the first kind of
+// offset and figures out how to split it between voffset and immoffset.
+std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
+    SDValue Offset, SelectionDAG &DAG) const {
+  SDLoc DL(Offset);
+  const unsigned MaxImm = 4095;
+  SDValue N0 = Offset;
+  ConstantSDNode *C1 = nullptr;
+
+  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
+    N0 = SDValue();
+  else if (DAG.isBaseWithConstantOffset(N0)) {
+    C1 = cast<ConstantSDNode>(N0.getOperand(1));
+    N0 = N0.getOperand(0);
+  }
+
+  if (C1) {
+    unsigned ImmOffset = C1->getZExtValue();
+    // If the immediate value is too big for the immoffset field, put the value
+    // and -4096 into the immoffset field so that the value that is copied/added
+    // for the voffset field is a multiple of 4096, and it stands more chance
+    // of being CSEd with the copy/add for another similar load/store.
+    // However, do not do that rounding down to a multiple of 4096 if that is a
+    // negative number, as it appears to be illegal to have a negative offset
+    // in the vgpr, even if adding the immediate offset makes it positive.
+    unsigned Overflow = ImmOffset & ~MaxImm;
+    ImmOffset -= Overflow;
+    if ((int32_t)Overflow < 0) {
+      Overflow += ImmOffset;
+      ImmOffset = 0;
+    }
+    C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
+    if (Overflow) {
+      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
+      if (!N0)
+        N0 = OverflowVal;
+      else {
+        SDValue Ops[] = { N0, OverflowVal };
+        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
+      }
+    }
+  }
+  if (!N0)
+    N0 = DAG.getConstant(0, DL, MVT::i32);
+  if (!C1)
+    C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
+  return {N0, SDValue(C1, 0)};
+}
+
+// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
+// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
+// pointed to by Offsets.
+void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+                                        SelectionDAG &DAG, SDValue *Offsets,
+                                        unsigned Align) const {
+  SDLoc DL(CombinedOffset);
+  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
+    uint32_t Imm = C->getZExtValue();
+    uint32_t SOffset, ImmOffset;
+    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
+      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
+      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
+      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
+      return;
+    }
+  }
+  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
+    SDValue N0 = CombinedOffset.getOperand(0);
+    SDValue N1 = CombinedOffset.getOperand(1);
+    uint32_t SOffset, ImmOffset;
+    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
+    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+                                                Subtarget, Align)) {
+      Offsets[0] = N0;
+      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
+      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
+      return;
+    }
+  }
+  Offsets[0] = CombinedOffset;
+  Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
+  Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
+}
+
 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                  ISD::LoadExtType ExtType, SDValue Op,
                                  const SDLoc &SL, EVT VT) {
@@ -5513,8 +6310,8 @@ SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const
 
   // FIXME: Constant loads should all be marked invariant.
   unsigned AS = Ld->getAddressSpace();
-  if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
-      AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
+  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
+      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
       (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
     return SDValue();
 
@@ -5625,15 +6422,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   // If there is a possibilty that flat instruction access scratch memory
   // then we need to use the same legalization rules we use for private.
-  if (AS == AMDGPUASI.FLAT_ADDRESS)
+  if (AS == AMDGPUAS::FLAT_ADDRESS)
     AS = MFI->hasFlatScratchInit() ?
-         AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
+         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
 
   unsigned NumElements = MemVT.getVectorNumElements();
 
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
-      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
-    if (!Op->isDivergent() && Alignment >= 4)
+  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+    if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
     // have the same legalization requirements as global and private
@@ -5641,28 +6438,28 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     //
   }
 
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
-      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
-      AS == AMDGPUASI.GLOBAL_ADDRESS) {
+  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+      AS == AMDGPUAS::GLOBAL_ADDRESS) {
     if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
         !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
-        Alignment >= 4)
+        Alignment >= 4 && NumElements < 32)
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
     // have the same legalization requirements as global and private
     // loads.
     //
   }
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
-      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
-      AS == AMDGPUASI.GLOBAL_ADDRESS ||
-      AS == AMDGPUASI.FLAT_ADDRESS) {
+  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+      AS == AMDGPUAS::GLOBAL_ADDRESS ||
+      AS == AMDGPUAS::FLAT_ADDRESS) {
     if (NumElements > 4)
       return SplitVectorLoad(Op, DAG);
     // v4 loads are supported for private and global memory.
     return SDValue();
   }
-  if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
     // Depending on the setting of the private_element_size field in the
     // resource descriptor, we can only make private accesses up to a certain
     // size.
@@ -5681,7 +6478,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     default:
       llvm_unreachable("unsupported private_element_size");
     }
-  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
     // Use ds_read_b128 if possible.
     if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
         MemVT.getStoreSize() == 16)
@@ -5689,6 +6486,17 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 
     if (NumElements > 2)
       return SplitVectorLoad(Op, DAG);
+
+    // SI has a hardware bug in the LDS / GDS boounds checking: if the base
+    // address is negative, then the instruction is incorrectly treated as
+    // out-of-bounds even if base + offsets is in bounds. Split vectorized
+    // loads here to avoid emitting ds_read2_b32. We may re-combine the
+    // load later in the SILoadStoreOptimizer.
+    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+        NumElements == 2 && MemVT.getStoreSize() == 8 &&
+        Load->getAlignment() < 8) {
+      return SplitVectorLoad(Op, DAG);
+    }
   }
   return SDValue();
 }
@@ -6058,17 +6866,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   // If there is a possibilty that flat instruction access scratch memory
   // then we need to use the same legalization rules we use for private.
-  if (AS == AMDGPUASI.FLAT_ADDRESS)
+  if (AS == AMDGPUAS::FLAT_ADDRESS)
     AS = MFI->hasFlatScratchInit() ?
-         AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
+         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
 
   unsigned NumElements = VT.getVectorNumElements();
-  if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
-      AS == AMDGPUASI.FLAT_ADDRESS) {
+  if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
+      AS == AMDGPUAS::FLAT_ADDRESS) {
     if (NumElements > 4)
       return SplitVectorStore(Op, DAG);
     return SDValue();
-  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
     switch (Subtarget->getMaxPrivateElementSize()) {
     case 4:
       return scalarizeVectorStore(Store, DAG);
@@ -6083,7 +6891,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
     default:
       llvm_unreachable("unsupported private_element_size");
     }
-  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
     // Use ds_write_b128 if possible.
     if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
         VT.getStoreSize() == 16)
@@ -6091,6 +6899,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 
     if (NumElements > 2)
       return SplitVectorStore(Op, DAG);
+
+    // SI has a hardware bug in the LDS / GDS boounds checking: if the base
+    // address is negative, then the instruction is incorrectly treated as
+    // out-of-bounds even if base + offsets is in bounds. Split vectorized
+    // stores here to avoid emitting ds_write2_b32. We may re-combine the
+    // store later in the SILoadStoreOptimizer.
+    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+        NumElements == 2 && VT.getStoreSize() == 8 &&
+        Store->getAlignment() < 8) {
+      return SplitVectorStore(Op, DAG);
+    }
+
     return SDValue();
   } else {
     llvm_unreachable("unhandled address space");
@@ -6101,17 +6921,24 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
   SDValue Arg = Op.getOperand(0);
+  SDValue TrigVal;
+
   // TODO: Should this propagate fast-math-flags?
-  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
-                                  DAG.getNode(ISD::FMUL, DL, VT, Arg,
-                                              DAG.getConstantFP(0.5/M_PI, DL,
-                                                                VT)));
+
+  SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
+
+  if (Subtarget->hasTrigReducedRange()) {
+    SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
+    TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
+  } else {
+    TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
+  }
 
   switch (Op.getOpcode()) {
   case ISD::FCOS:
-    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
+    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
   case ISD::FSIN:
-    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
+    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
   default:
     llvm_unreachable("Wrong trig opcode");
   }
@@ -6123,7 +6950,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co
   unsigned AS = AtomicNode->getAddressSpace();
 
   // No custom lowering required for local address space
-  if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
+  if (!isFlatGlobalAddrSpace(AS))
     return Op;
 
   // Non-local address space requires custom lowering for atomic compare
@@ -6475,6 +7302,29 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
     }
   }
 
+  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
+    std::swap(LHS, RHS);
+
+  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+      RHS.hasOneUse()) {
+    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
+    // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
+    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
+        (RHS.getOperand(0) == LHS.getOperand(0) &&
+         LHS.getOperand(0) == LHS.getOperand(1))) {
+      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
+      unsigned NewMask = LCC == ISD::SETO ?
+        Mask->getZExtValue() & ~OrdMask :
+        Mask->getZExtValue() & OrdMask;
+
+      SDLoc DL(N);
+      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
+                         DAG.getConstant(NewMask, DL, MVT::i32));
+    }
+  }
+
   if (VT == MVT::i32 &&
       (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
     // and x, (sext cc from i1) => select cc, x, 0
@@ -6798,158 +7648,294 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
 }
 
-static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
-  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
+                                       unsigned MaxDepth) const {
+  unsigned Opcode = Op.getOpcode();
+  if (Opcode == ISD::FCANONICALIZE)
     return true;
 
-  return DAG.isKnownNeverNaN(Op);
-}
+  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+    auto F = CFP->getValueAPF();
+    if (F.isNaN() && F.isSignaling())
+      return false;
+    return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
+  }
 
-static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
-                            const GCNSubtarget *ST, unsigned MaxDepth=5) {
   // If source is a result of another standard FP operation it is already in
   // canonical form.
+  if (MaxDepth == 0)
+    return false;
 
-  switch (Op.getOpcode()) {
-  default:
-    break;
-
+  switch (Opcode) {
   // These will flush denorms if required.
   case ISD::FADD:
   case ISD::FSUB:
   case ISD::FMUL:
-  case ISD::FSQRT:
   case ISD::FCEIL:
   case ISD::FFLOOR:
   case ISD::FMA:
   case ISD::FMAD:
-
-  case ISD::FCANONICALIZE:
-    return true;
-
+  case ISD::FSQRT:
+  case ISD::FDIV:
+  case ISD::FREM:
   case ISD::FP_ROUND:
-    return Op.getValueType().getScalarType() != MVT::f16 ||
-           ST->hasFP16Denormals();
-
   case ISD::FP_EXTEND:
-    return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
-           ST->hasFP16Denormals();
+  case AMDGPUISD::FMUL_LEGACY:
+  case AMDGPUISD::FMAD_FTZ:
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RSQ:
+  case AMDGPUISD::RSQ_CLAMP:
+  case AMDGPUISD::RCP_LEGACY:
+  case AMDGPUISD::RSQ_LEGACY:
+  case AMDGPUISD::RCP_IFLAG:
+  case AMDGPUISD::TRIG_PREOP:
+  case AMDGPUISD::DIV_SCALE:
+  case AMDGPUISD::DIV_FMAS:
+  case AMDGPUISD::DIV_FIXUP:
+  case AMDGPUISD::FRACT:
+  case AMDGPUISD::LDEXP:
+  case AMDGPUISD::CVT_PKRTZ_F16_F32:
+  case AMDGPUISD::CVT_F32_UBYTE0:
+  case AMDGPUISD::CVT_F32_UBYTE1:
+  case AMDGPUISD::CVT_F32_UBYTE2:
+  case AMDGPUISD::CVT_F32_UBYTE3:
+    return true;
 
   // It can/will be lowered or combined as a bit operation.
   // Need to check their input recursively to handle.
   case ISD::FNEG:
   case ISD::FABS:
-    return (MaxDepth > 0) &&
-           isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
+  case ISD::FCOPYSIGN:
+    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
 
   case ISD::FSIN:
   case ISD::FCOS:
   case ISD::FSINCOS:
     return Op.getValueType().getScalarType() != MVT::f16;
 
-  // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
-  // For such targets need to check their input recursively.
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
+  case AMDGPUISD::CLAMP:
+  case AMDGPUISD::FMED3:
+  case AMDGPUISD::FMAX3:
+  case AMDGPUISD::FMIN3: {
+    // FIXME: Shouldn't treat the generic operations different based these.
+    // However, we aren't really required to flush the result from
+    // minnum/maxnum..
 
-    if (ST->supportsMinMaxDenormModes() &&
-        DAG.isKnownNeverNaN(Op.getOperand(0)) &&
-        DAG.isKnownNeverNaN(Op.getOperand(1)))
+    // snans will be quieted, so we only need to worry about denormals.
+    if (Subtarget->supportsMinMaxDenormModes() ||
+        denormalsEnabledForType(Op.getValueType()))
       return true;
 
-    return (MaxDepth > 0) &&
-           isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
-           isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
+    // Flushing may be required.
+    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
+    // targets need to check their input recursively.
+
+    // FIXME: Does this apply with clamp? It's implemented with max.
+    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
+      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
+        return false;
+    }
 
-  case ISD::ConstantFP: {
-    auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
-    return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
+    return true;
   }
+  case ISD::SELECT: {
+    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
+           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
   }
-  return false;
+  case ISD::BUILD_VECTOR: {
+    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+      SDValue SrcOp = Op.getOperand(i);
+      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
+        return false;
+    }
+
+    return true;
+  }
+  case ISD::EXTRACT_VECTOR_ELT:
+  case ISD::EXTRACT_SUBVECTOR: {
+    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+  }
+  case ISD::INSERT_VECTOR_ELT: {
+    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
+           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
+  }
+  case ISD::UNDEF:
+    // Could be anything.
+    return false;
+
+  case ISD::BITCAST: {
+    // Hack round the mess we make when legalizing extract_vector_elt
+    SDValue Src = Op.getOperand(0);
+    if (Src.getValueType() == MVT::i16 &&
+        Src.getOpcode() == ISD::TRUNCATE) {
+      SDValue TruncSrc = Src.getOperand(0);
+      if (TruncSrc.getValueType() == MVT::i32 &&
+          TruncSrc.getOpcode() == ISD::BITCAST &&
+          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
+        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
+      }
+    }
+
+    return false;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrinsicID
+      = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    // TODO: Handle more intrinsics
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_cvt_pkrtz:
+    case Intrinsic::amdgcn_cubeid:
+    case Intrinsic::amdgcn_frexp_mant:
+    case Intrinsic::amdgcn_fdot2:
+      return true;
+    default:
+      break;
+    }
+
+    LLVM_FALLTHROUGH;
+  }
+  default:
+    return denormalsEnabledForType(Op.getValueType()) &&
+           DAG.isKnownNeverSNaN(Op);
+  }
+
+  llvm_unreachable("invalid operation");
 }
 
 // Constant fold canonicalize.
+SDValue SITargetLowering::getCanonicalConstantFP(
+  SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
+  // Flush denormals to 0 if not enabled.
+  if (C.isDenormal() && !denormalsEnabledForType(VT))
+    return DAG.getConstantFP(0.0, SL, VT);
+
+  if (C.isNaN()) {
+    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
+    if (C.isSignaling()) {
+      // Quiet a signaling NaN.
+      // FIXME: Is this supposed to preserve payload bits?
+      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
+    }
+
+    // Make sure it is the canonical NaN bitpattern.
+    //
+    // TODO: Can we use -1 as the canonical NaN value since it's an inline
+    // immediate?
+    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
+      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
+  }
+
+  // Already canonical.
+  return DAG.getConstantFP(C, SL, VT);
+}
+
+static bool vectorEltWillFoldAway(SDValue Op) {
+  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
+}
+
 SDValue SITargetLowering::performFCanonicalizeCombine(
   SDNode *N,
   DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
 
   // fcanonicalize undef -> qnan
   if (N0.isUndef()) {
-    EVT VT = N->getValueType(0);
     APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
     return DAG.getConstantFP(QNaN, SDLoc(N), VT);
   }
 
-  ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0);
-  if (!CFP) {
-    SDValue N0 = N->getOperand(0);
-    EVT VT = N0.getValueType().getScalarType();
-    auto ST = getSubtarget();
+  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
+    EVT VT = N->getValueType(0);
+    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
+  }
 
-    if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
-         (VT == MVT::f64 && ST->hasFP64Denormals()) ||
-         (VT == MVT::f16 && ST->hasFP16Denormals())) &&
-        DAG.isKnownNeverNaN(N0))
-      return N0;
+  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
+  //                                                   (fcanonicalize k)
+  //
+  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
 
-    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+  // TODO: This could be better with wider vectors that will be split to v2f16,
+  // and to consider uses since there aren't that many packed operations.
+  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
+      isTypeLegal(MVT::v2f16)) {
+    SDLoc SL(N);
+    SDValue NewElts[2];
+    SDValue Lo = N0.getOperand(0);
+    SDValue Hi = N0.getOperand(1);
+    EVT EltVT = Lo.getValueType();
 
-    if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
-        isCanonicalized(DAG, N0, ST))
-      return N0;
+    if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
+      for (unsigned I = 0; I != 2; ++I) {
+        SDValue Op = N0.getOperand(I);
+        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+          NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
+                                              CFP->getValueAPF());
+        } else if (Op.isUndef()) {
+          // Handled below based on what the other operand is.
+          NewElts[I] = Op;
+        } else {
+          NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
+        }
+      }
 
-    return SDValue();
-  }
+      // If one half is undef, and one is constant, perfer a splat vector rather
+      // than the normal qNaN. If it's a register, prefer 0.0 since that's
+      // cheaper to use and may be free with a packed operation.
+      if (NewElts[0].isUndef()) {
+        if (isa<ConstantFPSDNode>(NewElts[1]))
+          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
+            NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
+      }
 
-  const APFloat &C = CFP->getValueAPF();
+      if (NewElts[1].isUndef()) {
+        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
+          NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
+      }
 
-  // Flush denormals to 0 if not enabled.
-  if (C.isDenormal()) {
-    EVT VT = N->getValueType(0);
-    EVT SVT = VT.getScalarType();
-    if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
-      return DAG.getConstantFP(0.0, SDLoc(N), VT);
+      return DAG.getBuildVector(VT, SL, NewElts);
+    }
+  }
 
-    if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
-      return DAG.getConstantFP(0.0, SDLoc(N), VT);
+  unsigned SrcOpc = N0.getOpcode();
 
-    if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
-      return DAG.getConstantFP(0.0, SDLoc(N), VT);
-  }
+  // If it's free to do so, push canonicalizes further up the source, which may
+  // find a canonical source.
+  //
+  // TODO: More opcodes. Note this is unsafe for the the _ieee minnum/maxnum for
+  // sNaNs.
+  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
+    auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+    if (CRHS && N0.hasOneUse()) {
+      SDLoc SL(N);
+      SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
+                                   N0.getOperand(0));
+      SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
+      DCI.AddToWorklist(Canon0.getNode());
 
-  if (C.isNaN()) {
-    EVT VT = N->getValueType(0);
-    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
-    if (C.isSignaling()) {
-      // Quiet a signaling NaN.
-      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+      return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
     }
-
-    // Make sure it is the canonical NaN bitpattern.
-    //
-    // TODO: Can we use -1 as the canonical NaN value since it's an inline
-    // immediate?
-    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
-      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
   }
 
-  return N0;
+  return isCanonicalized(DAG, N0) ? N0 : SDValue();
 }
 
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
   switch (Opc) {
   case ISD::FMAXNUM:
+  case ISD::FMAXNUM_IEEE:
     return AMDGPUISD::FMAX3;
   case ISD::SMAX:
     return AMDGPUISD::SMAX3;
   case ISD::UMAX:
     return AMDGPUISD::UMAX3;
   case ISD::FMINNUM:
+  case ISD::FMINNUM_IEEE:
     return AMDGPUISD::FMIN3;
   case ISD::SMIN:
     return AMDGPUISD::SMIN3;
@@ -7044,11 +8030,18 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
     // then give the other result, which is different from med3 with a NaN
     // input.
     SDValue Var = Op0.getOperand(0);
-    if (!isKnownNeverSNan(DAG, Var))
+    if (!DAG.isKnownNeverSNaN(Var))
       return SDValue();
 
-    return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
-                       Var, SDValue(K0, 0), SDValue(K1, 0));
+    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+    if ((!K0->hasOneUse() ||
+         TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
+        (!K1->hasOneUse() ||
+         TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
+      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+                         Var, SDValue(K0, 0), SDValue(K1, 0));
+    }
   }
 
   return SDValue();
@@ -7109,6 +8102,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
 
   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
+       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
        (Opc == AMDGPUISD::FMIN_LEGACY &&
         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
       (VT == MVT::f32 || VT == MVT::f64 ||
@@ -7216,9 +8210,11 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
 
     switch(Opc) {
     default:
-      return SDValue();
+      break;
       // TODO: Support other binary operations.
     case ISD::FADD:
+    case ISD::FSUB:
+    case ISD::FMUL:
     case ISD::ADD:
     case ISD::UMIN:
     case ISD::UMAX:
@@ -7226,25 +8222,54 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
     case ISD::SMAX:
     case ISD::FMAXNUM:
     case ISD::FMINNUM:
-      return DAG.getNode(Opc, SL, EltVT,
-                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
-                                     Vec.getOperand(0), Idx),
-                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
-                                     Vec.getOperand(1), Idx));
+    case ISD::FMAXNUM_IEEE:
+    case ISD::FMINNUM_IEEE: {
+      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+                                 Vec.getOperand(0), Idx);
+      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+                                 Vec.getOperand(1), Idx);
+
+      DCI.AddToWorklist(Elt0.getNode());
+      DCI.AddToWorklist(Elt1.getNode());
+      return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
+    }
     }
   }
 
-  if (!DCI.isBeforeLegalize())
-    return SDValue();
-
   unsigned VecSize = VecVT.getSizeInBits();
   unsigned EltSize = EltVT.getSizeInBits();
 
+  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
+  // This elminates non-constant index and subsequent movrel or scratch access.
+  // Sub-dword vectors of size 2 dword or less have better implementation.
+  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
+  // instructions.
+  if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
+      !isa<ConstantSDNode>(N->getOperand(1))) {
+    SDLoc SL(N);
+    SDValue Idx = N->getOperand(1);
+    EVT IdxVT = Idx.getValueType();
+    SDValue V;
+    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+      SDValue IC = DAG.getConstant(I, SL, IdxVT);
+      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
+      if (I == 0)
+        V = Elt;
+      else
+        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
+    }
+    return V;
+  }
+
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
   // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
   // elements. This exposes more load reduction opportunities by replacing
   // multiple small extract_vector_elements with a single 32-bit extract.
   auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (EltSize <= 16 &&
+  if (isa<MemSDNode>(Vec) &&
+      EltSize <= 16 &&
       EltVT.isByteSized() &&
       VecSize > 32 &&
       VecSize % 32 == 0 &&
@@ -7274,46 +8299,40 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
   return SDValue();
 }
 
-static bool convertBuildVectorCastElt(SelectionDAG &DAG,
-                                      SDValue &Lo, SDValue &Hi) {
-  if (Hi.getOpcode() == ISD::BITCAST &&
-      Hi.getOperand(0).getValueType() == MVT::f16 &&
-      (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
-    Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
-    Hi = Hi.getOperand(0);
-    return true;
-  }
-
-  return false;
-}
-
-SDValue SITargetLowering::performBuildVectorCombine(
-  SDNode *N, DAGCombinerInfo &DCI) const {
-  SDLoc SL(N);
+SDValue
+SITargetLowering::performInsertVectorEltCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  SDValue Vec = N->getOperand(0);
+  SDValue Idx = N->getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned VecSize = VecVT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
 
-  if (!isTypeLegal(MVT::v2i16))
+  // INSERT_VECTOR_ELT (<n x e>, var-idx)
+  // => BUILD_VECTOR n x select (e, const-idx)
+  // This elminates non-constant index and subsequent movrel or scratch access.
+  // Sub-dword vectors of size 2 dword or less have better implementation.
+  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
+  // instructions.
+  if (isa<ConstantSDNode>(Idx) ||
+      VecSize > 256 || (VecSize <= 64 && EltSize < 32))
     return SDValue();
-  SelectionDAG &DAG = DCI.DAG;
-  EVT VT = N->getValueType(0);
-
-  if (VT == MVT::v2i16) {
-    SDValue Lo = N->getOperand(0);
-    SDValue Hi = N->getOperand(1);
 
-    // v2i16 build_vector (const|undef), (bitcast f16:$x)
-    // -> bitcast (v2f16 build_vector const|undef, $x
-    if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
-      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi  });
-      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
-    }
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  SDValue Ins = N->getOperand(1);
+  EVT IdxVT = Idx.getValueType();
 
-    if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
-      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo  });
-      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
-    }
+  SmallVector<SDValue, 16> Ops;
+  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+    SDValue IC = DAG.getConstant(I, SL, IdxVT);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
+    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
+    Ops.push_back(V);
   }
 
-  return SDValue();
+  return DAG.getBuildVector(VecVT, SL, Ops);
 }
 
 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
@@ -7568,7 +8587,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
   EVT VT = N->getValueType(0);
   SDLoc SL(N);
 
-  if (!Subtarget->hasDLInsts() || VT != MVT::f32)
+  if (!Subtarget->hasDotInsts() || VT != MVT::f32)
     return SDValue();
 
   // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
@@ -7705,16 +8724,26 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                            VT != MVT::f16))
     return SDValue();
 
-  // Match isinf pattern
+  // Match isinf/isfinite pattern
   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
-  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
+  // (fcmp one (fabs x), inf) -> (fp_class x,
+  // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
+  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
     if (!CRHS)
       return SDValue();
 
     const APFloat &APF = CRHS->getValueAPF();
     if (APF.isInfinity() && !APF.isNegative()) {
-      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
+      const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
+                                 SIInstrFlags::N_INFINITY;
+      const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
+                                    SIInstrFlags::P_ZERO |
+                                    SIInstrFlags::N_NORMAL |
+                                    SIInstrFlags::P_NORMAL |
+                                    SIInstrFlags::N_SUBNORMAL |
+                                    SIInstrFlags::P_SUBNORMAL;
+      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                          DAG.getConstant(Mask, SL, MVT::i32));
     }
@@ -7759,8 +8788,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                         !DCI.isBeforeLegalizeOps());
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
-      TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
+  if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
     DCI.CommitTargetLoweringOpt(TLO);
   }
 
@@ -7792,6 +8820,9 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
 
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
+  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+    return SDValue();
+
   switch (N->getOpcode()) {
   default:
     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -7810,17 +8841,15 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performSetCCCombine(N, DCI);
   case ISD::FMAXNUM:
   case ISD::FMINNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINNUM_IEEE:
   case ISD::SMAX:
   case ISD::SMIN:
   case ISD::UMAX:
   case ISD::UMIN:
   case AMDGPUISD::FMIN_LEGACY:
-  case AMDGPUISD::FMAX_LEGACY: {
-    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
-        getTargetMachine().getOptLevel() > CodeGenOpt::None)
-      return performMinMaxCombine(N, DCI);
-    break;
-  }
+  case AMDGPUISD::FMAX_LEGACY:
+    return performMinMaxCombine(N, DCI);
   case ISD::FMA:
     return performFMACombine(N, DCI);
   case ISD::LOAD: {
@@ -7912,8 +8941,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   }
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DCI);
-  case ISD::BUILD_VECTOR:
-    return performBuildVectorCombine(N, DCI);
+  case ISD::INSERT_VECTOR_ELT:
+    return performInsertVectorEltCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
@@ -7926,6 +8955,7 @@ static unsigned SubIdx2Lane(unsigned Idx) {
   case AMDGPU::sub1: return 1;
   case AMDGPU::sub2: return 2;
   case AMDGPU::sub3: return 3;
+  case AMDGPU::sub4: return 4; // Possible with TFE/LWE
   }
 }
 
@@ -7939,11 +8969,16 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
     return Node; // not implemented for D16
 
-  SDNode *Users[4] = { nullptr };
+  SDNode *Users[5] = { nullptr };
   unsigned Lane = 0;
   unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
   unsigned NewDmask = 0;
+  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
+  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
+  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+                  Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
+  unsigned TFCLane = 0;
   bool HasChain = Node->getNumValues() > 1;
 
   if (OldDmask == 0) {
@@ -7951,6 +8986,12 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
     return Node;
   }
 
+  unsigned OldBitsSet = countPopulation(OldDmask);
+  // Work out which is the TFE/LWE lane if that is enabled.
+  if (UsesTFC) {
+    TFCLane = OldBitsSet;
+  }
+
   // Try to figure out the used register components
   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
        I != E; ++I) {
@@ -7970,28 +9011,49 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
     // set, etc.
     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
 
-    // Set which texture component corresponds to the lane.
-    unsigned Comp;
-    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
-      Comp = countTrailingZeros(Dmask);
-      Dmask &= ~(1 << Comp);
-    }
+    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
+    if (UsesTFC && Lane == TFCLane) {
+      Users[Lane] = *I;
+    } else {
+      // Set which texture component corresponds to the lane.
+      unsigned Comp;
+      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
+        Comp = countTrailingZeros(Dmask);
+        Dmask &= ~(1 << Comp);
+      }
 
-    // Abort if we have more than one user per component
-    if (Users[Lane])
-      return Node;
+      // Abort if we have more than one user per component.
+      if (Users[Lane])
+        return Node;
 
-    Users[Lane] = *I;
-    NewDmask |= 1 << Comp;
+      Users[Lane] = *I;
+      NewDmask |= 1 << Comp;
+    }
   }
 
+  // Don't allow 0 dmask, as hardware assumes one channel enabled.
+  bool NoChannels = !NewDmask;
+  if (NoChannels) {
+    // If the original dmask has one channel - then nothing to do
+    if (OldBitsSet == 1)
+      return Node;
+    // Use an arbitrary dmask - required for the instruction to work
+    NewDmask = 1;
+  }
   // Abort if there's no change
   if (NewDmask == OldDmask)
     return Node;
 
   unsigned BitsSet = countPopulation(NewDmask);
 
-  int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
+  // Check for TFE or LWE - increase the number of channels by one to account
+  // for the extra return value
+  // This will need adjustment for D16 if this is also included in
+  // adjustWriteMask (this function) but at present D16 are excluded.
+  unsigned NewChannels = BitsSet + UsesTFC;
+
+  int NewOpcode =
+      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
   assert(NewOpcode != -1 &&
          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
          "failed to find equivalent MIMG op");
@@ -8004,8 +9066,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 
   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
 
-  MVT ResultVT = BitsSet == 1 ?
-    SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
+  MVT ResultVT = NewChannels == 1 ?
+    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
+                           NewChannels == 5 ? 8 : NewChannels);
   SDVTList NewVTList = HasChain ?
     DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
 
@@ -8015,11 +9078,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 
   if (HasChain) {
     // Update chain.
-    NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
+    DAG.setNodeMemRefs(NewNode, Node->memoperands());
     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
   }
 
-  if (BitsSet == 1) {
+  if (NewChannels == 1) {
     assert(Node->hasNUsesOfValue(1, 0));
     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                       SDLoc(Node), Users[Lane]->getValueType(0),
@@ -8029,19 +9092,24 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   }
 
   // Update the users of the node with the new indices
-  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
     SDNode *User = Users[i];
-    if (!User)
-      continue;
-
-    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
-    DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+    if (!User) {
+      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
+      // Users[0] is still nullptr because channel 0 doesn't really have a use.
+      if (i || !NoChannels)
+        continue;
+    } else {
+      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
+      DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+    }
 
     switch (Idx) {
     default: break;
     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
     }
   }
 
@@ -8457,49 +9525,56 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
   Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
 }
 
+LLVM_ATTRIBUTE_UNUSED
+static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
+  assert(N->getOpcode() == ISD::CopyFromReg);
+  do {
+    // Follow the chain until we find an INLINEASM node.
+    N = N->getOperand(0).getNode();
+    if (N->getOpcode() == ISD::INLINEASM)
+      return true;
+  } while (N->getOpcode() == ISD::CopyFromReg);
+  return false;
+}
+
 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
-  FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const
+  FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
 {
   switch (N->getOpcode()) {
-    case ISD::Register:
     case ISD::CopyFromReg:
     {
-      const RegisterSDNode *R = nullptr;
-      if (N->getOpcode() == ISD::Register) {
-        R = dyn_cast<RegisterSDNode>(N);
-      }
-      else {
-        R = dyn_cast<RegisterSDNode>(N->getOperand(1));
-      }
-      if (R)
-      {
-        const MachineFunction * MF = FLI->MF;
-        const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-        const MachineRegisterInfo &MRI = MF->getRegInfo();
-        const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
-        unsigned Reg = R->getReg();
-        if (TRI.isPhysicalRegister(Reg))
-          return TRI.isVGPR(MRI, Reg);
+      const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
+      const MachineFunction * MF = FLI->MF;
+      const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+      const MachineRegisterInfo &MRI = MF->getRegInfo();
+      const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+      unsigned Reg = R->getReg();
+      if (TRI.isPhysicalRegister(Reg))
+        return !TRI.isSGPRReg(MRI, Reg);
 
-        if (MRI.isLiveIn(Reg)) {
-          // workitem.id.x workitem.id.y workitem.id.z
-          // Any VGPR formal argument is also considered divergent
-          if (TRI.isVGPR(MRI, Reg))
-              return true;
-          // Formal arguments of non-entry functions
-          // are conservatively considered divergent
-          else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
-            return true;
-        }
-        return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+      if (MRI.isLiveIn(Reg)) {
+        // workitem.id.x workitem.id.y workitem.id.z
+        // Any VGPR formal argument is also considered divergent
+        if (!TRI.isSGPRReg(MRI, Reg))
+          return true;
+        // Formal arguments of non-entry functions
+        // are conservatively considered divergent
+        else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
+          return true;
+        return false;
       }
+      const Value *V = FLI->getValueFromVirtualReg(Reg);
+      if (V)
+        return KDA->isDivergent(V);
+      assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
+      return !TRI.isSGPRReg(MRI, Reg);
     }
     break;
     case ISD::LOAD: {
-      const LoadSDNode *L = dyn_cast<LoadSDNode>(N);
-      if (L->getMemOperand()->getAddrSpace() ==
-          Subtarget->getAMDGPUAS().PRIVATE_ADDRESS)
-        return true;
+      const LoadSDNode *L = cast<LoadSDNode>(N);
+      unsigned AS = L->getAddressSpace();
+      // A flat load may access private memory.
+      return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
     } break;
     case ISD::CALLSEQ_END:
     return true;
@@ -8522,3 +9597,30 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
   }
   return false;
 }
+
+bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
+  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
+  case MVT::f32:
+    return Subtarget->hasFP32Denormals();
+  case MVT::f64:
+    return Subtarget->hasFP64Denormals();
+  case MVT::f16:
+    return Subtarget->hasFP16Denormals();
+  default:
+    return false;
+  }
+}
+
+bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+                                                    const SelectionDAG &DAG,
+                                                    bool SNaN,
+                                                    unsigned Depth) const {
+  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
+    if (Subtarget->enableDX10Clamp())
+      return true; // Clamped to 0.
+    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+  }
+
+  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
+                                                            SNaN, Depth);
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 5b3d49b3d8e30..bcef519ee6635 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -60,11 +60,22 @@ private:
                                  MVT VT, unsigned Offset) const;
   SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
                      SelectionDAG &DAG) const;
+  SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
+                       SDValue GLC, SelectionDAG &DAG) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
 
+  // The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset
+  // (the offset that is included in bounds checking and swizzling, to be split
+  // between the instruction's voffset and immoffset fields) and soffset (the
+  // offset that is excluded from bounds checking and swizzling, to go in the
+  // instruction's soffset field).  This function takes the first kind of
+  // offset and figures out how to split it between voffset and immoffset.
+  std::pair<SDValue, SDValue> splitBufferOffsets(SDValue Offset,
+                                                 SelectionDAG &DAG) const;
+
   SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -81,7 +92,7 @@ private:
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
-                              SelectionDAG &DAG,
+                              SelectionDAG &DAG, ArrayRef<SDValue> Ops,
                               bool IsIntrinsic = false) const;
 
   SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
@@ -99,6 +110,7 @@ private:
 
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
                              SelectionDAG &DAG) const;
@@ -130,6 +142,8 @@ private:
   SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
+                                 const APFloat &C) const;
   SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
@@ -140,7 +154,7 @@ private:
   SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-  SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
@@ -156,7 +170,6 @@ private:
   SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
-  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
   unsigned isCFIntrinsic(const SDNode *Intr) const;
@@ -175,6 +188,12 @@ private:
   /// global value \p GV, false otherwise.
   bool shouldEmitPCReloc(const GlobalValue *GV) const;
 
+  // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
+  // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
+  // pointed to by Offsets.
+  void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
+                        SDValue *Offsets, unsigned Align = 4) const;
+
 public:
   SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
 
@@ -192,6 +211,7 @@ public:
                             SmallVectorImpl<Value*> &/*Ops*/,
                             Type *&/*AccessTy*/) const override;
 
+  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS,
                              Instruction *I = nullptr) const override;
@@ -215,7 +235,7 @@ public:
   bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
 
   TargetLoweringBase::LegalizeTypeAction
-  getPreferredVectorAction(EVT VT) const override;
+  getPreferredVectorAction(MVT VT) const override;
 
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                         Type *Ty) const override;
@@ -248,11 +268,11 @@ public:
 
   void passSpecialInputs(
     CallLoweringInfo &CLI,
+    CCState &CCInfo,
     const SIMachineFunctionInfo &Info,
     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
     SmallVectorImpl<SDValue> &MemOpChains,
-    SDValue Chain,
-    SDValue StackPtr) const;
+    SDValue Chain) const;
 
   SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                           CallingConv::ID CallConv, bool isVarArg,
@@ -322,7 +342,16 @@ public:
                                      unsigned Depth = 0) const override;
 
   bool isSDNodeSourceOfDivergence(const SDNode *N,
-    FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override;
+    FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;
+
+  bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
+                       unsigned MaxDepth = 5) const;
+  bool denormalsEnabledForType(EVT VT) const;
+
+  bool isKnownNeverNaNForTargetNode(SDValue Op,
+                                    const SelectionDAG &DAG,
+                                    bool SNaN = false,
+                                    unsigned Depth = 0) const override;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index dc9397cf7b85e..ba21a5ce1293a 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -66,6 +66,8 @@ private:
 
   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
 
+  bool optimizeVccBranch(MachineInstr &MI) const;
+
 public:
   static char ID;
 
@@ -320,6 +322,96 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
   return true;
 }
 
+bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
+  // Match:
+  // sreg = -1
+  // vcc = S_AND_B64 exec, sreg
+  // S_CBRANCH_VCC[N]Z
+  // =>
+  // S_CBRANCH_EXEC[N]Z
+  bool Changed = false;
+  MachineBasicBlock &MBB = *MI.getParent();
+  const unsigned CondReg = AMDGPU::VCC;
+  const unsigned ExecReg = AMDGPU::EXEC;
+  const unsigned And = AMDGPU::S_AND_B64;
+
+  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
+                                      E = MBB.rend();
+  bool ReadsCond = false;
+  unsigned Threshold = 5;
+  for (++A ; A != E ; ++A) {
+    if (!--Threshold)
+      return false;
+    if (A->modifiesRegister(ExecReg, TRI))
+      return false;
+    if (A->modifiesRegister(CondReg, TRI)) {
+      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
+        return false;
+      break;
+    }
+    ReadsCond |= A->readsRegister(CondReg, TRI);
+  }
+  if (A == E)
+    return false;
+
+  MachineOperand &Op1 = A->getOperand(1);
+  MachineOperand &Op2 = A->getOperand(2);
+  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
+    TII->commuteInstruction(*A);
+    Changed = true;
+  }
+  if (Op1.getReg() != ExecReg)
+    return Changed;
+  if (Op2.isImm() && Op2.getImm() != -1)
+    return Changed;
+
+  unsigned SReg = AMDGPU::NoRegister;
+  if (Op2.isReg()) {
+    SReg = Op2.getReg();
+    auto M = std::next(A);
+    bool ReadsSreg = false;
+    for ( ; M != E ; ++M) {
+      if (M->definesRegister(SReg, TRI))
+        break;
+      if (M->modifiesRegister(SReg, TRI))
+        return Changed;
+      ReadsSreg |= M->readsRegister(SReg, TRI);
+    }
+    if (M == E ||
+        !M->isMoveImmediate() ||
+        !M->getOperand(1).isImm() ||
+        M->getOperand(1).getImm() != -1)
+      return Changed;
+    // First if sreg is only used in and instruction fold the immediate
+    // into that and.
+    if (!ReadsSreg && Op2.isKill()) {
+      A->getOperand(2).ChangeToImmediate(-1);
+      M->eraseFromParent();
+    }
+  }
+
+  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
+      MI.killsRegister(CondReg, TRI))
+    A->eraseFromParent();
+
+  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
+  if (SReg == ExecReg) {
+    if (IsVCCZ) {
+      MI.eraseFromParent();
+      return true;
+    }
+    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+  } else {
+    MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
+                               : AMDGPU::S_CBRANCH_EXECNZ));
+  }
+
+  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
+  MI.addImplicitDefUseOperands(*MBB.getParent());
+
+  return true;
+}
+
 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -384,7 +476,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         kill(MI);
 
         if (ExecBranchStack.empty()) {
-          if (skipIfDead(MI, *NextBB)) {
+          if (NextBB != BE && skipIfDead(MI, *NextBB)) {
             HaveSkipBlock = true;
             NextBB = std::next(BI);
             BE = MF.end();
@@ -417,6 +509,11 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         }
         break;
 
+      case AMDGPU::S_CBRANCH_VCCZ:
+      case AMDGPU::S_CBRANCH_VCCNZ:
+        MadeChange |= optimizeVccBranch(MI);
+        break;
+
       default:
         break;
       }
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index d456e3d9b94d0..afc0b44676109 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -13,6 +13,14 @@
 /// Memory reads and writes are issued asynchronously, so we need to insert
 /// S_WAITCNT instructions when we want to access any of their results or
 /// overwrite any register that's used asynchronously.
+///
+/// TODO: This pass currently keeps one timeline per hardware counter. A more
+/// finely-grained approach that keeps one timeline per event type could
+/// sometimes get away with generating weaker s_waitcnt instructions. For
+/// example, when both SMEM and LDS are in flight and we need to wait for
+/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
+/// but the pass will currently generate a conservative lgkmcnt(0) because
+/// multiple event types are in flight.
 //
 //===----------------------------------------------------------------------===//
 
@@ -33,7 +41,6 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -69,6 +76,25 @@ static cl::opt<unsigned> ForceEmitZeroFlag(
 
 namespace {
 
+template <typename EnumT>
+class enum_iterator
+    : public iterator_facade_base<enum_iterator<EnumT>,
+                                  std::forward_iterator_tag, const EnumT> {
+  EnumT Value;
+public:
+  enum_iterator() = default;
+  enum_iterator(EnumT Value) : Value(Value) {}
+
+  enum_iterator &operator++() {
+    Value = static_cast<EnumT>(Value + 1);
+    return *this;
+  }
+
+  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
+
+  EnumT operator*() const { return Value; }
+};
+
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
 // s_waitcnt instruction needs to be emited.
@@ -77,12 +103,17 @@ namespace {
 
 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
 
+iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
+  return make_range(enum_iterator<InstCounterType>(VM_CNT),
+                    enum_iterator<InstCounterType>(NUM_INST_CNTS));
+}
+
 using RegInterval = std::pair<signed, signed>;
 
 struct {
-  int32_t VmcntMax;
-  int32_t ExpcntMax;
-  int32_t LgkmcntMax;
+  uint32_t VmcntMax;
+  uint32_t ExpcntMax;
+  uint32_t LgkmcntMax;
   int32_t NumVGPRsMax;
   int32_t NumSGPRsMax;
 } HardwareLimits;
@@ -108,6 +139,14 @@ enum WaitEventType {
   NUM_WAIT_EVENTS,
 };
 
+static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
+  (1 << VMEM_ACCESS),
+  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+      (1 << SQ_MESSAGE),
+  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+      (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
+};
+
 // The mapping is:
 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
 //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
@@ -122,30 +161,38 @@ enum RegisterMapping {
   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
 };
 
-#define ForAllWaitEventType(w)                                                 \
-  for (enum WaitEventType w = (enum WaitEventType)0;                           \
-       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
-       (w) = (enum WaitEventType)((w) + 1))
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+  switch (T) {
+  case VM_CNT:
+    Wait.VmCnt = std::min(Wait.VmCnt, Count);
+    break;
+  case EXP_CNT:
+    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
+    break;
+  case LGKM_CNT:
+    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
+    break;
+  default:
+    llvm_unreachable("bad InstCounterType");
+  }
+}
 
-// This is a per-basic-block object that maintains current score brackets
-// of each wait counter, and a per-register scoreboard for each wait counter.
+// This objects maintains the current score brackets of each wait counter, and
+// a per-register scoreboard for each wait counter.
+//
 // We also maintain the latest score for every event type that can change the
 // waitcnt in order to know if there are multiple types of events within
 // the brackets. When multiple types of event happen in the bracket,
 // wait count may get decreased out of order, therefore we need to put in
 // "s_waitcnt 0" before use.
-class BlockWaitcntBrackets {
+class WaitcntBrackets {
 public:
-  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
+  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
+    for (auto T : inst_counter_types())
       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
-    }
   }
 
-  ~BlockWaitcntBrackets() = default;
-
-  static int32_t getWaitCountMax(InstCounterType T) {
+  static uint32_t getWaitCountMax(InstCounterType T) {
     switch (T) {
     case VM_CNT:
       return HardwareLimits.VmcntMax;
@@ -159,33 +206,14 @@ public:
     return 0;
   }
 
-  void setScoreLB(InstCounterType T, int32_t Val) {
-    assert(T < NUM_INST_CNTS);
-    if (T >= NUM_INST_CNTS)
-      return;
-    ScoreLBs[T] = Val;
-  }
-
-  void setScoreUB(InstCounterType T, int32_t Val) {
-    assert(T < NUM_INST_CNTS);
-    if (T >= NUM_INST_CNTS)
-      return;
-    ScoreUBs[T] = Val;
-    if (T == EXP_CNT) {
-      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
-      if (ScoreLBs[T] < UB)
-        ScoreLBs[T] = UB;
-    }
-  }
-
-  int32_t getScoreLB(InstCounterType T) {
+  uint32_t getScoreLB(InstCounterType T) const {
     assert(T < NUM_INST_CNTS);
     if (T >= NUM_INST_CNTS)
       return 0;
     return ScoreLBs[T];
   }
 
-  int32_t getScoreUB(InstCounterType T) {
+  uint32_t getScoreUB(InstCounterType T) const {
     assert(T < NUM_INST_CNTS);
     if (T >= NUM_INST_CNTS)
       return 0;
@@ -194,89 +222,56 @@ public:
 
   // Mapping from event to counter.
   InstCounterType eventCounter(WaitEventType E) {
-    switch (E) {
-    case VMEM_ACCESS:
+    if (E == VMEM_ACCESS)
       return VM_CNT;
-    case LDS_ACCESS:
-    case GDS_ACCESS:
-    case SQ_MESSAGE:
-    case SMEM_ACCESS:
+    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
       return LGKM_CNT;
-    case EXP_GPR_LOCK:
-    case GDS_GPR_LOCK:
-    case VMW_GPR_LOCK:
-    case EXP_POS_ACCESS:
-    case EXP_PARAM_ACCESS:
-      return EXP_CNT;
-    default:
-      llvm_unreachable("unhandled event type");
-    }
-    return NUM_INST_CNTS;
-  }
-
-  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
-    if (GprNo < NUM_ALL_VGPRS) {
-      if (GprNo > VgprUB) {
-        VgprUB = GprNo;
-      }
-      VgprScores[T][GprNo] = Val;
-    } else {
-      assert(T == LGKM_CNT);
-      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
-        SgprUB = GprNo - NUM_ALL_VGPRS;
-      }
-      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
-    }
+    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
+    return EXP_CNT;
   }
 
-  int32_t getRegScore(int GprNo, InstCounterType T) {
+  uint32_t getRegScore(int GprNo, InstCounterType T) {
     if (GprNo < NUM_ALL_VGPRS) {
       return VgprScores[T][GprNo];
     }
+    assert(T == LGKM_CNT);
     return SgprScores[GprNo - NUM_ALL_VGPRS];
   }
 
   void clear() {
     memset(ScoreLBs, 0, sizeof(ScoreLBs));
     memset(ScoreUBs, 0, sizeof(ScoreUBs));
-    memset(EventUBs, 0, sizeof(EventUBs));
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
+    PendingEvents = 0;
+    memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
+    for (auto T : inst_counter_types())
       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
-    }
     memset(SgprScores, 0, sizeof(SgprScores));
   }
 
+  bool merge(const WaitcntBrackets &Other);
+
   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                              const MachineRegisterInfo *MRI,
                              const SIRegisterInfo *TRI, unsigned OpNo,
                              bool Def) const;
 
-  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
-                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
-                   unsigned OpNo, int32_t Val);
-
-  void setWaitAtBeginning() { WaitAtBeginning = true; }
-  void clearWaitAtBeginning() { WaitAtBeginning = false; }
-  bool getWaitAtBeginning() const { return WaitAtBeginning; }
-  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
   int32_t getMaxVGPR() const { return VgprUB; }
   int32_t getMaxSGPR() const { return SgprUB; }
 
-  int32_t getEventUB(enum WaitEventType W) const {
-    assert(W < NUM_WAIT_EVENTS);
-    return EventUBs[W];
-  }
-
-  bool counterOutOfOrder(InstCounterType T);
-  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
+  bool counterOutOfOrder(InstCounterType T) const;
+  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+  void determineWait(InstCounterType T, uint32_t ScoreToWait,
+                     AMDGPU::Waitcnt &Wait) const;
+  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
+  void applyWaitcnt(InstCounterType T, unsigned Count);
   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                      const MachineRegisterInfo *MRI, WaitEventType E,
                      MachineInstr &MI);
 
-  bool hasPendingSMEM() const {
-    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
-            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
+  bool hasPending() const { return PendingEvents != 0; }
+  bool hasPendingEvent(WaitEventType E) const {
+    return PendingEvents & (1 << E);
   }
 
   bool hasPendingFlat() const {
@@ -291,75 +286,71 @@ public:
     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
   }
 
-  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
-
-  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
+  void print(raw_ostream &);
+  void dump() { print(dbgs()); }
 
-  bool getRevisitLoop() const { return RevisitLoop; }
-  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
+private:
+  struct MergeInfo {
+    uint32_t OldLB;
+    uint32_t OtherLB;
+    uint32_t MyShift;
+    uint32_t OtherShift;
+  };
+  static bool mergeScore(const MergeInfo &M, uint32_t &Score,
+                         uint32_t OtherScore);
 
-  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
-  int32_t getPostOrder() const { return PostOrder; }
+  void setScoreLB(InstCounterType T, uint32_t Val) {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return;
+    ScoreLBs[T] = Val;
+  }
 
-  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
-  void clearWaitcnt() { Waitcnt = nullptr; }
-  MachineInstr *getWaitcnt() const { return Waitcnt; }
+  void setScoreUB(InstCounterType T, uint32_t Val) {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return;
+    ScoreUBs[T] = Val;
+    if (T == EXP_CNT) {
+      uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
+      if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
+        ScoreLBs[T] = UB;
+    }
+  }
 
-  bool mixedExpTypes() const { return MixedExpTypes; }
-  void setMixedExpTypes(bool MixedExpTypesIn) {
-    MixedExpTypes = MixedExpTypesIn;
+  void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
+    if (GprNo < NUM_ALL_VGPRS) {
+      if (GprNo > VgprUB) {
+        VgprUB = GprNo;
+      }
+      VgprScores[T][GprNo] = Val;
+    } else {
+      assert(T == LGKM_CNT);
+      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
+        SgprUB = GprNo - NUM_ALL_VGPRS;
+      }
+      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
+    }
   }
 
-  void print(raw_ostream &);
-  void dump() { print(dbgs()); }
+  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
+                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
+                   unsigned OpNo, uint32_t Val);
 
-private:
   const GCNSubtarget *ST = nullptr;
-  bool WaitAtBeginning = false;
-  bool RevisitLoop = false;
-  bool MixedExpTypes = false;
-  int32_t PostOrder = 0;
-  MachineInstr *Waitcnt = nullptr;
-  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
-  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
-  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
+  uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
+  uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
+  uint32_t PendingEvents = 0;
+  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
   // Remember the last flat memory operation.
-  int32_t LastFlat[NUM_INST_CNTS] = {0};
+  uint32_t LastFlat[NUM_INST_CNTS] = {0};
   // wait_cnt scores for every vgpr.
   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
   int32_t VgprUB = 0;
   int32_t SgprUB = 0;
-  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
+  uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
-  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
-};
-
-// This is a per-loop-region object that records waitcnt status at the end of
-// loop footer from the previous iteration. We also maintain an iteration
-// count to track the number of times the loop has been visited. When it
-// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
-// at the end of the loop footer.
-class LoopWaitcntData {
-public:
-  LoopWaitcntData() = default;
-  ~LoopWaitcntData() = default;
-
-  void incIterCnt() { IterCnt++; }
-  void resetIterCnt() { IterCnt = 0; }
-  unsigned getIterCnt() { return IterCnt; }
-
-  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
-  MachineInstr *getWaitcnt() const { return LfWaitcnt; }
-
-  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }
-
-private:
-  // s_waitcnt added at the end of loop footer to stablize wait scores
-  // at the end of the loop footer.
-  MachineInstr *LfWaitcnt = nullptr;
-  // Number of iterations the loop has been visited, not including the initial
-  // walk over.
-  int32_t IterCnt = 0;
+  uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
 };
 
 class SIInsertWaitcnts : public MachineFunctionPass {
@@ -368,22 +359,21 @@ private:
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
-  const MachineLoopInfo *MLI = nullptr;
-  AMDGPU::IsaInfo::IsaVersion IV;
-  AMDGPUAS AMDGPUASI;
+  AMDGPU::IsaVersion IV;
 
-  DenseSet<MachineBasicBlock *> BlockVisitedSet;
   DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseSet<MachineInstr *> VCCZBugHandledSet;
 
-  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
-      BlockWaitcntBracketsMap;
+  struct BlockInfo {
+    MachineBasicBlock *MBB;
+    std::unique_ptr<WaitcntBrackets> Incoming;
+    bool Dirty = true;
 
-  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
+    explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
+  };
 
-  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
-
-  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
+  std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
+  DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
 
   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
   // because of amdgpu-waitcnt-forcezero flag
@@ -407,20 +397,11 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineLoopInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
-  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
-    // The waitcnt information is copied because it changes as the block is
-    // traversed.
-    KillWaitBrackets.push_back(
-        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
-  }
-
   bool isForceEmitWaitcnt() const {
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1))
+    for (auto T : inst_counter_types())
       if (ForceEmitWaitcnt[T])
         return true;
     return false;
@@ -454,27 +435,22 @@ public:
   }
 
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
-  void generateWaitcntInstBefore(MachineInstr &MI,
-                                  BlockWaitcntBrackets *ScoreBrackets);
+  bool generateWaitcntInstBefore(MachineInstr &MI,
+                                 WaitcntBrackets &ScoreBrackets,
+                                 MachineInstr *OldWaitcntInstr);
   void updateEventWaitcntAfter(MachineInstr &Inst,
-                               BlockWaitcntBrackets *ScoreBrackets);
-  void mergeInputScoreBrackets(MachineBasicBlock &Block);
-  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
-  unsigned countNumBottomBlocks(const MachineLoop *Loop);
-  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
-  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
-  bool isWaitcntStronger(unsigned LHS, unsigned RHS);
-  unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
+                               WaitcntBrackets *ScoreBrackets);
+  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
+                            WaitcntBrackets &ScoreBrackets);
 };
 
 } // end anonymous namespace
 
-RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
-                                                 const SIInstrInfo *TII,
-                                                 const MachineRegisterInfo *MRI,
-                                                 const SIRegisterInfo *TRI,
-                                                 unsigned OpNo,
-                                                 bool Def) const {
+RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
+                                            const SIInstrInfo *TII,
+                                            const MachineRegisterInfo *MRI,
+                                            const SIRegisterInfo *TRI,
+                                            unsigned OpNo, bool Def) const {
   const MachineOperand &Op = MI->getOperand(OpNo);
   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
       (Def && !Op.isDef()))
@@ -512,11 +488,11 @@ RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
   return Result;
 }
 
-void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
-                                       const SIInstrInfo *TII,
-                                       const SIRegisterInfo *TRI,
-                                       const MachineRegisterInfo *MRI,
-                                       unsigned OpNo, int32_t Val) {
+void WaitcntBrackets::setExpScore(const MachineInstr *MI,
+                                  const SIInstrInfo *TII,
+                                  const SIRegisterInfo *TRI,
+                                  const MachineRegisterInfo *MRI, unsigned OpNo,
+                                  uint32_t Val) {
   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
   LLVM_DEBUG({
     const MachineOperand &Opnd = MI->getOperand(OpNo);
@@ -527,26 +503,26 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
   }
 }
 
-void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
-                                         const SIRegisterInfo *TRI,
-                                         const MachineRegisterInfo *MRI,
-                                         WaitEventType E, MachineInstr &Inst) {
+void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
+                                    const SIRegisterInfo *TRI,
+                                    const MachineRegisterInfo *MRI,
+                                    WaitEventType E, MachineInstr &Inst) {
   const MachineRegisterInfo &MRIA = *MRI;
   InstCounterType T = eventCounter(E);
-  int32_t CurrScore = getScoreUB(T) + 1;
-  // EventUB and ScoreUB need to be update regardless if this event changes
-  // the score of a register or not.
+  uint32_t CurrScore = getScoreUB(T) + 1;
+  if (CurrScore == 0)
+    report_fatal_error("InsertWaitcnt score wraparound");
+  // PendingEvents and ScoreUB need to be update regardless if this event
+  // changes the score of a register or not.
   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
-  EventUBs[E] = CurrScore;
+  if (!hasPendingEvent(E)) {
+    if (PendingEvents & WaitEventMaskForInst[T])
+      MixedPendingEvents[T] = true;
+    PendingEvents |= 1 << E;
+  }
   setScoreUB(T, CurrScore);
 
   if (T == EXP_CNT) {
-    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
-    // is required.
-    if (!MixedExpTypes) {
-      MixedExpTypes = counterOutOfOrder(EXP_CNT);
-    }
-
     // Put score on the source vgprs. If this is a store, just use those
     // specific register(s).
     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
@@ -671,12 +647,11 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
   }
 }
 
-void BlockWaitcntBrackets::print(raw_ostream &OS) {
+void WaitcntBrackets::print(raw_ostream &OS) {
   OS << '\n';
-  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-       T = (enum InstCounterType)(T + 1)) {
-    int LB = getScoreLB(T);
-    int UB = getScoreUB(T);
+  for (auto T : inst_counter_types()) {
+    uint32_t LB = getScoreLB(T);
+    uint32_t UB = getScoreUB(T);
 
     switch (T) {
     case VM_CNT:
@@ -696,10 +671,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
     if (LB < UB) {
       // Print vgpr scores.
       for (int J = 0; J <= getMaxVGPR(); J++) {
-        int RegScore = getRegScore(J, T);
+        uint32_t RegScore = getRegScore(J, T);
         if (RegScore <= LB)
           continue;
-        int RelScore = RegScore - LB - 1;
+        uint32_t RelScore = RegScore - LB - 1;
         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
           OS << RelScore << ":v" << J << " ";
         } else {
@@ -709,10 +684,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
       // Also need to print sgpr scores for lgkm_cnt.
       if (T == LGKM_CNT) {
         for (int J = 0; J <= getMaxSGPR(); J++) {
-          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+          uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
           if (RegScore <= LB)
             continue;
-          int RelScore = RegScore - LB - 1;
+          uint32_t RelScore = RegScore - LB - 1;
           OS << RelScore << ":s" << J << " ";
         }
       }
@@ -722,23 +697,31 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
   OS << '\n';
 }
 
-unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
-                                                int ScoreToWait) {
-  unsigned int NeedWait = 0;
-  if (ScoreToWait == -1) {
-    // The score to wait is unknown. This implies that it was not encountered
-    // during the path of the CFG walk done during the current traversal but
-    // may be seen on a different path. Emit an s_wait counter with a
-    // conservative value of 0 for the counter.
-    NeedWait = CNT_MASK(T);
-    setScoreLB(T, getScoreUB(T));
-    return NeedWait;
-  }
+/// Simplify the waitcnt, in the sense of removing redundant counts, and return
+/// whether a waitcnt instruction is needed at all.
+bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
+         simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
+         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+}
+
+bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
+                                      unsigned &Count) const {
+  const uint32_t LB = getScoreLB(T);
+  const uint32_t UB = getScoreUB(T);
+  if (Count < UB && UB - Count > LB)
+    return true;
 
+  Count = ~0u;
+  return false;
+}
+
+void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
+                                    AMDGPU::Waitcnt &Wait) const {
   // If the score of src_operand falls within the bracket, we need an
   // s_waitcnt instruction.
-  const int32_t LB = getScoreLB(T);
-  const int32_t UB = getScoreUB(T);
+  const uint32_t LB = getScoreLB(T);
+  const uint32_t UB = getScoreUB(T);
   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
     if ((T == VM_CNT || T == LGKM_CNT) &&
         hasPendingFlat() &&
@@ -746,90 +729,46 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
       // If there is a pending FLAT operation, and this is a VMem or LGKM
       // waitcnt and the target can report early completion, then we need
       // to force a waitcnt 0.
-      NeedWait = CNT_MASK(T);
-      setScoreLB(T, getScoreUB(T));
+      addWait(Wait, T, 0);
     } else if (counterOutOfOrder(T)) {
       // Counter can get decremented out-of-order when there
       // are multiple types event in the bracket. Also emit an s_wait counter
       // with a conservative value of 0 for the counter.
-      NeedWait = CNT_MASK(T);
-      setScoreLB(T, getScoreUB(T));
+      addWait(Wait, T, 0);
     } else {
-      NeedWait = CNT_MASK(T);
-      setScoreLB(T, ScoreToWait);
+      addWait(Wait, T, UB - ScoreToWait);
     }
   }
+}
 
-  return NeedWait;
+void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
+  applyWaitcnt(VM_CNT, Wait.VmCnt);
+  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
+  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
 }
 
-// Where there are multiple types of event in the bracket of a counter,
-// the decrement may go out of order.
-bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
-  switch (T) {
-  case VM_CNT:
-    return false;
-  case LGKM_CNT: {
-    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      // Scalar memory read always can go out of order.
-      return true;
-    }
-    int NumEventTypes = 0;
-    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (NumEventTypes <= 1) {
-      return false;
-    }
-    break;
+void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
+  const uint32_t UB = getScoreUB(T);
+  if (Count >= UB)
+    return;
+  if (Count != 0) {
+    if (counterOutOfOrder(T))
+      return;
+    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
+  } else {
+    setScoreLB(T, UB);
+    MixedPendingEvents[T] = false;
+    PendingEvents &= ~WaitEventMaskForInst[T];
   }
-  case EXP_CNT: {
-    // If there has been a mixture of export types, then a waitcnt exp(0) is
-    // required.
-    if (MixedExpTypes)
-      return true;
-    int NumEventTypes = 0;
-    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-
-    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
+}
 
-    if (NumEventTypes <= 1) {
-      return false;
-    }
-    break;
-  }
-  default:
-    break;
-  }
-  return true;
+// Where there are multiple types of event in the bracket of a counter,
+// the decrement may go out of order.
+bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
+  // Scalar memory read always can go out of order.
+  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
+    return true;
+  return MixedPendingEvents[T];
 }
 
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
@@ -851,29 +790,6 @@ static bool readsVCCZ(const MachineInstr &MI) {
          !MI.getOperand(1).isUndef();
 }
 
-/// Given wait count encodings checks if LHS is stronger than RHS.
-bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
-  if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
-    return false;
-  if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
-    return false;
-  if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
-    return false;
-  return true;
-}
-
-/// Given wait count encodings create a new encoding which is stronger
-/// or equal to both.
-unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
-  unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
-                            AMDGPU::decodeVmcnt(IV, RHS));
-  unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
-                              AMDGPU::decodeLgkmcnt(IV, RHS));
-  unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
-                             AMDGPU::decodeExpcnt(IV, RHS));
-  return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
-}
-
 ///  Generate s_waitcnt instruction to be placed before cur_Inst.
 ///  Instructions of a given type are returned in order,
 ///  but instructions of different types can complete out of order.
@@ -884,51 +800,23 @@ unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
 ///  and if so what the value of each counter is.
 ///  The "score bracket" is bound by the lower bound and upper bound
 ///  scores (*_score_LB and *_score_ub respectively).
-void SIInsertWaitcnts::generateWaitcntInstBefore(
-    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
-  // To emit, or not to emit - that's the question!
-  // Start with an assumption that there is no need to emit.
-  unsigned int EmitWaitcnt = 0;
-
-  // No need to wait before phi. If a phi-move exists, then the wait should
-  // has been inserted before the move. If a phi-move does not exist, then
-  // wait should be inserted before the real use. The same is true for
-  // sc-merge. It is not a coincident that all these cases correspond to the
-  // instructions that are skipped in the assembling loop.
-  bool NeedLineMapping = false; // TODO: Check on this.
-
-  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
-  bool ForceEmitZeroWaitcnt = false;
-
+bool SIInsertWaitcnts::generateWaitcntInstBefore(
+    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
+    MachineInstr *OldWaitcntInstr) {
   setForceEmitWaitcnt();
   bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
 
-  if (MI.isDebugInstr() &&
-      // TODO: any other opcode?
-      !NeedLineMapping) {
-    return;
-  }
+  if (MI.isDebugInstr())
+    return false;
 
-  // See if an s_waitcnt is forced at block entry, or is needed at
-  // program end.
-  if (ScoreBrackets->getWaitAtBeginning()) {
-    // Note that we have already cleared the state, so we don't need to update
-    // it.
-    ScoreBrackets->clearWaitAtBeginning();
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
-      EmitWaitcnt |= CNT_MASK(T);
-      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
-    }
-  }
+  AMDGPU::Waitcnt Wait;
 
   // See if this instruction has a forced S_WAITCNT VM.
   // TODO: Handle other cases of NeedsWaitcntVmBefore()
-  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
-           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
-           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
-    EmitWaitcnt |=
-        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
+      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
+      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+    Wait.VmCnt = 0;
   }
 
   // All waits must be resolved at call return.
@@ -936,23 +824,14 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
   //   with knowledge of the called routines.
   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
       MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
-      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
-        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
-        EmitWaitcnt |= CNT_MASK(T);
-      }
-    }
+    Wait = AMDGPU::Waitcnt::allZero();
   }
   // Resolve vm waits before gs-done.
   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
             AMDGPU::SendMsg::ID_GS_DONE)) {
-    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
-      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
-      EmitWaitcnt |= CNT_MASK(VM_CNT);
-    }
+    Wait.VmCnt = 0;
   }
 #if 0 // TODO: the following blocks of logic when we have fence.
   else if (MI.getOpcode() == SC_FENCE) {
@@ -1016,14 +895,12 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
       // Export and GDS are tracked individually, either may trigger a waitcnt
       // for EXEC.
-      EmitWaitcnt |= ScoreBrackets->updateByWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
-      EmitWaitcnt |= ScoreBrackets->updateByWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
-      EmitWaitcnt |= ScoreBrackets->updateByWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
-      EmitWaitcnt |= ScoreBrackets->updateByWait(
-          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
+      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
+          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
+          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
+          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
+        Wait.ExpCnt = 0;
+      }
     }
 
 #if 0 // TODO: the following code to handle CALL.
@@ -1051,27 +928,27 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
     // instruction.
     for (const MachineMemOperand *Memop : MI.memoperands()) {
       unsigned AS = Memop->getAddrSpace();
-      if (AS != AMDGPUASI.LOCAL_ADDRESS)
+      if (AS != AMDGPUAS::LOCAL_ADDRESS)
         continue;
       unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
       // VM_CNT is only relevant to vgpr or LDS.
-      EmitWaitcnt |= ScoreBrackets->updateByWait(
-          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+      ScoreBrackets.determineWait(
+          VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
     }
 
     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
       const MachineOperand &Op = MI.getOperand(I);
       const MachineRegisterInfo &MRIA = *MRI;
       RegInterval Interval =
-          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
+          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
         if (TRI->isVGPR(MRIA, Op.getReg())) {
           // VM_CNT is only relevant to vgpr or LDS.
-          EmitWaitcnt |= ScoreBrackets->updateByWait(
-              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+          ScoreBrackets.determineWait(
+              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
         }
-        EmitWaitcnt |= ScoreBrackets->updateByWait(
-            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+        ScoreBrackets.determineWait(
+            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
       }
     }
     // End of for loop that looks at all source operands to decide vm_wait_cnt
@@ -1086,29 +963,29 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
       // FIXME: Should not be relying on memoperands.
       for (const MachineMemOperand *Memop : MI.memoperands()) {
         unsigned AS = Memop->getAddrSpace();
-        if (AS != AMDGPUASI.LOCAL_ADDRESS)
+        if (AS != AMDGPUAS::LOCAL_ADDRESS)
           continue;
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
-        EmitWaitcnt |= ScoreBrackets->updateByWait(
-            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
-        EmitWaitcnt |= ScoreBrackets->updateByWait(
-            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+        ScoreBrackets.determineWait(
+            VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+        ScoreBrackets.determineWait(
+            EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
       }
     }
     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
       MachineOperand &Def = MI.getOperand(I);
       const MachineRegisterInfo &MRIA = *MRI;
       RegInterval Interval =
-          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
+          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
         if (TRI->isVGPR(MRIA, Def.getReg())) {
-          EmitWaitcnt |= ScoreBrackets->updateByWait(
-              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
-          EmitWaitcnt |= ScoreBrackets->updateByWait(
-              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+          ScoreBrackets.determineWait(
+              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+          ScoreBrackets.determineWait(
+              EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
         }
-        EmitWaitcnt |= ScoreBrackets->updateByWait(
-            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+        ScoreBrackets.determineWait(
+            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
       }
     } // End of for loop that looks at all dest operands.
   }
@@ -1119,182 +996,79 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
   // requiring a WAITCNT beforehand.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
       !ST->hasAutoWaitcntBeforeBarrier()) {
-    EmitWaitcnt |=
-        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
-    EmitWaitcnt |= ScoreBrackets->updateByWait(
-        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-    EmitWaitcnt |= ScoreBrackets->updateByWait(
-        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
+    Wait = AMDGPU::Waitcnt::allZero();
   }
 
   // TODO: Remove this work-around, enable the assert for Bug 457939
   //       after fixing the scheduler. Also, the Shader Compiler code is
   //       independent of target.
   if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
-    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
-            ScoreBrackets->getScoreUB(LGKM_CNT) &&
-        ScoreBrackets->hasPendingSMEM()) {
-      // Wait on everything, not just LGKM.  vccz reads usually come from
-      // terminators, and we always wait on everything at the end of the
-      // block, so if we only wait on LGKM here, we might end up with
-      // another s_waitcnt inserted right after this if there are non-LGKM
-      // instructions still outstanding.
-      // FIXME: this is too conservative / the comment is wrong.
-      // We don't wait on everything at the end of the block and we combine
-      // waitcnts so we should never have back-to-back waitcnts.
-      ForceEmitZeroWaitcnt = true;
-      EmitWaitcnt = true;
+    if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+            ScoreBrackets.getScoreUB(LGKM_CNT) &&
+        ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+      Wait.LgkmCnt = 0;
     }
   }
 
-  // Does this operand processing indicate s_wait counter update?
-  if (EmitWaitcnt || IsForceEmitWaitcnt) {
-    int CntVal[NUM_INST_CNTS];
-
-    bool UseDefaultWaitcntStrategy = true;
-    if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
-      // Force all waitcnts to 0.
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
-      }
-      CntVal[VM_CNT] = 0;
-      CntVal[EXP_CNT] = 0;
-      CntVal[LGKM_CNT] = 0;
-      UseDefaultWaitcntStrategy = false;
-    }
-
-    if (UseDefaultWaitcntStrategy) {
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        if (EmitWaitcnt & CNT_MASK(T)) {
-          int Delta =
-              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
-          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
-          if (Delta >= MaxDelta) {
-            Delta = -1;
-            if (T != EXP_CNT) {
-              ScoreBrackets->setScoreLB(
-                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
-            }
-            EmitWaitcnt &= ~CNT_MASK(T);
-          }
-          CntVal[T] = Delta;
-        } else {
-          // If we are not waiting for a particular counter then encode
-          // it as -1 which means "don't care."
-          CntVal[T] = -1;
-        }
+  // Early-out if no wait is indicated.
+  if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
+    bool Modified = false;
+    if (OldWaitcntInstr) {
+      if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
+        TrackedWaitcntSet.erase(OldWaitcntInstr);
+        OldWaitcntInstr->eraseFromParent();
+        Modified = true;
+      } else {
+        int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
+        ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
       }
+      Modified = true;
     }
+    return Modified;
+  }
 
-    // If we are not waiting on any counter we can skip the wait altogether.
-    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
-      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
-      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
-      if (!OldWaitcnt ||
-          (AMDGPU::decodeVmcnt(IV, Imm) !=
-                          (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
-          (AMDGPU::decodeExpcnt(IV, Imm) !=
-           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
-          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
-           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
-        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
-        if (ContainingLoop) {
-          MachineBasicBlock *TBB = ContainingLoop->getHeader();
-          BlockWaitcntBrackets *ScoreBracket =
-              BlockWaitcntBracketsMap[TBB].get();
-          if (!ScoreBracket) {
-            assert(!BlockVisitedSet.count(TBB));
-            BlockWaitcntBracketsMap[TBB] =
-                llvm::make_unique<BlockWaitcntBrackets>(ST);
-            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
-          }
-          ScoreBracket->setRevisitLoop(true);
-          LLVM_DEBUG(dbgs()
-                         << "set-revisit2: Block"
-                         << ContainingLoop->getHeader()->getNumber() << '\n';);
-        }
-      }
+  if (ForceEmitZeroWaitcnts)
+    Wait = AMDGPU::Waitcnt::allZero();
 
-      // Update an existing waitcount, or make a new one.
-      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
-                      ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
-                      ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
-                      ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
-      // We don't remove waitcnts that existed prior to the waitcnt
-      // pass. Check if the waitcnt to-be-inserted can be avoided
-      // or if the prev waitcnt can be updated.
-      bool insertSWaitInst = true;
-      for (MachineBasicBlock::iterator I = MI.getIterator(),
-                                       B = MI.getParent()->begin();
-           insertSWaitInst && I != B; --I) {
-        if (I == MI.getIterator())
-          continue;
+  if (ForceEmitWaitcnt[VM_CNT])
+    Wait.VmCnt = 0;
+  if (ForceEmitWaitcnt[EXP_CNT])
+    Wait.ExpCnt = 0;
+  if (ForceEmitWaitcnt[LGKM_CNT])
+    Wait.LgkmCnt = 0;
 
-        switch (I->getOpcode()) {
-        case AMDGPU::S_WAITCNT:
-          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
-            insertSWaitInst = false;
-          else if (!OldWaitcnt) {
-            OldWaitcnt = &*I;
-            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
-          }
-          break;
-        // TODO: skip over instructions which never require wait.
-        }
-        break;
-      }
-      if (insertSWaitInst) {
-        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
-          if (ForceEmitZeroWaitcnts)
-            LLVM_DEBUG(
-                dbgs()
-                << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
-          if (IsForceEmitWaitcnt)
-            LLVM_DEBUG(dbgs()
-                       << "Force emit a s_waitcnt due to debug counter\n");
+  ScoreBrackets.applyWaitcnt(Wait);
 
-          OldWaitcnt->getOperand(0).setImm(Enc);
-          if (!OldWaitcnt->getParent())
-            MI.getParent()->insert(MI, OldWaitcnt);
+  AMDGPU::Waitcnt OldWait;
+  if (OldWaitcntInstr) {
+    OldWait =
+        AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
+  }
+  if (OldWait.dominates(Wait))
+    return false;
 
-          LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
-                            << "Old Instr: " << MI << '\n'
-                            << "New Instr: " << *OldWaitcnt << '\n');
-        } else {
-            auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
-                               MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-                             .addImm(Enc);
-            TrackedWaitcntSet.insert(SWaitInst);
+  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
+    Wait = Wait.combined(OldWait);
 
-            LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
-                              << "Old Instr: " << MI << '\n'
-                              << "New Instr: " << *SWaitInst << '\n');
-        }
-      }
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  if (OldWaitcntInstr) {
+    OldWaitcntInstr->getOperand(0).setImm(Enc);
 
-      if (CntVal[EXP_CNT] == 0) {
-        ScoreBrackets->setMixedExpTypes(false);
-      }
-    }
-  }
-}
+    LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+                      << "Old Instr: " << MI << '\n'
+                      << "New Instr: " << *OldWaitcntInstr << '\n');
+  } else {
+    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
+                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                         .addImm(Enc);
+    TrackedWaitcntSet.insert(SWaitInst);
 
-void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
-                                             MachineInstr *Waitcnt) {
-  if (MBB.empty()) {
-    MBB.push_back(Waitcnt);
-    return;
+    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                      << "Old Instr: " << MI << '\n'
+                      << "New Instr: " << *SWaitInst << '\n');
   }
 
-  MachineBasicBlock::iterator It = MBB.end();
-  MachineInstr *MI = &*(--It);
-  if (MI->isBranch()) {
-    MBB.insert(It, Waitcnt);
-  } else {
-    MBB.push_back(Waitcnt);
-  }
+  return true;
 }
 
 // This is a flat memory operation. Check to see if it has memory
@@ -1305,15 +1079,15 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
 
   for (const MachineMemOperand *Memop : MI.memoperands()) {
     unsigned AS = Memop->getAddrSpace();
-    if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
+    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
       return true;
   }
 
   return false;
 }
 
-void SIInsertWaitcnts::updateEventWaitcntAfter(
-    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
+void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
+                                               WaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
   // instruction, update the upper-bound of the appropriate counter's
   // bracket and the destination operand scores.
@@ -1379,342 +1153,124 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(
   }
 }
 
-// Merge the score brackets of the Block's predecessors;
-// this merged score bracket is used when adding waitcnts to the Block
-void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
-  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
-  int32_t MaxPending[NUM_INST_CNTS] = {0};
-  int32_t MaxFlat[NUM_INST_CNTS] = {0};
-  bool MixedExpTypes = false;
-
-  // For single basic block loops, we need to retain the Block's
-  // score bracket to have accurate Pred info. So, make a copy of Block's
-  // score bracket, clear() it (which retains several important bits of info),
-  // populate, and then replace en masse. For non-single basic block loops,
-  // just clear Block's current score bracket and repopulate in-place.
-  bool IsSelfPred;
-  std::unique_ptr<BlockWaitcntBrackets> S;
-
-  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
-    != Block.pred_end();
-  if (IsSelfPred) {
-    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
-    ScoreBrackets = S.get();
-  }
-
-  ScoreBrackets->clear();
-
-  // See if there are any uninitialized predecessors. If so, emit an
-  // s_waitcnt 0 at the beginning of the block.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-    bool Visited = BlockVisitedSet.count(Pred);
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
-      continue;
-    }
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
-      int span =
-          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
-      MaxPending[T] = std::max(MaxPending[T], span);
-      span =
-          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
-      MaxFlat[T] = std::max(MaxFlat[T], span);
-    }
-
-    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
-  }
-
-  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
-  // Also handle kills for exit block.
-  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
-    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        int Span = KillWaitBrackets[I]->getScoreUB(T) -
-                   KillWaitBrackets[I]->getScoreLB(T);
-        MaxPending[T] = std::max(MaxPending[T], Span);
-        Span = KillWaitBrackets[I]->pendingFlat(T) -
-               KillWaitBrackets[I]->getScoreLB(T);
-        MaxFlat[T] = std::max(MaxFlat[T], Span);
-      }
-
-      MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
-    }
-  }
-
-  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-    bool Visited = BlockVisitedSet.count(Pred);
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
-      continue;
-    }
-
-    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
-                  PredScoreBrackets->getScoreLB(EXP_CNT);
-    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
-    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
-                  PredScoreBrackets->getScoreLB(EXP_CNT);
-    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
-  }
-
-  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
-  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
-    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
-      int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
-                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
-      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
-      int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
-                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
-      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
-    }
-  }
-
-#if 0
-  // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
-  // TODO: how does LC distinguish between function entry and main entry?
-  // If this is the entry to a function, force a wait.
-  MachineBasicBlock &Entry = Block.getParent()->front();
-  if (Entry.getNumber() == Block.getNumber()) {
-    ScoreBrackets->setWaitAtBeginning();
-    return;
-  }
-#endif
-
-  // Now set the current Block's brackets to the largest ending bracket.
-  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-       T = (enum InstCounterType)(T + 1)) {
-    ScoreBrackets->setScoreUB(T, MaxPending[T]);
-    ScoreBrackets->setScoreLB(T, 0);
-    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
-  }
-
-  ScoreBrackets->setMixedExpTypes(MixedExpTypes);
-
-  // Set the register scoreboard.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (!BlockVisitedSet.count(Pred)) {
-      continue;
-    }
+bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
+                                 uint32_t OtherScore) {
+  uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
+  uint32_t OtherShifted =
+      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
+  Score = std::max(MyShifted, OtherShifted);
+  return OtherShifted > MyShifted;
+}
 
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
+/// Merge the pending events and associater score brackets of \p Other into
+/// this brackets status.
+///
+/// Returns whether the merge resulted in a change that requires tighter waits
+/// (i.e. the merged brackets strictly dominate the original brackets).
+bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
+  bool StrictDom = false;
 
-    // Now merge the gpr_reg_score information
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
-      int PredLB = PredScoreBrackets->getScoreLB(T);
-      int PredUB = PredScoreBrackets->getScoreUB(T);
-      if (PredLB < PredUB) {
-        int PredScale = MaxPending[T] - PredUB;
-        // Merge vgpr scores.
-        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
-          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
-          if (PredRegScore <= PredLB)
-            continue;
-          int NewRegScore = PredScale + PredRegScore;
-          ScoreBrackets->setRegScore(
-              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
-        }
-        // Also need to merge sgpr scores for lgkm_cnt.
-        if (T == LGKM_CNT) {
-          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
-            int PredRegScore =
-                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
-            if (PredRegScore <= PredLB)
-              continue;
-            int NewRegScore = PredScale + PredRegScore;
-            ScoreBrackets->setRegScore(
-                J + NUM_ALL_VGPRS, LGKM_CNT,
-                std::max(
-                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
-                    NewRegScore));
-          }
-        }
-      }
-    }
+  for (auto T : inst_counter_types()) {
+    // Merge event flags for this counter
+    const bool OldOutOfOrder = counterOutOfOrder(T);
+    const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
+    const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
+    if (OtherEvents & ~OldEvents)
+      StrictDom = true;
+    if (Other.MixedPendingEvents[T] ||
+        (OldEvents && OtherEvents && OldEvents != OtherEvents))
+      MixedPendingEvents[T] = true;
+    PendingEvents |= OtherEvents;
 
-    // Also merge the WaitEvent information.
-    ForAllWaitEventType(W) {
-      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
-      int PredEventUB = PredScoreBrackets->getEventUB(W);
-      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
-        int NewEventUB =
-            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
-        if (NewEventUB > 0) {
-          ScoreBrackets->setEventUB(
-              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
-        }
-      }
-    }
-  }
+    // Merge scores for this counter
+    const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
+    const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+    MergeInfo M;
+    M.OldLB = ScoreLBs[T];
+    M.OtherLB = Other.ScoreLBs[T];
+    M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
+    M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
 
-  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
-  // Set the register scoreboard.
-  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
-    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
-      // Now merge the gpr_reg_score information.
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        int PredLB = KillWaitBrackets[I]->getScoreLB(T);
-        int PredUB = KillWaitBrackets[I]->getScoreUB(T);
-        if (PredLB < PredUB) {
-          int PredScale = MaxPending[T] - PredUB;
-          // Merge vgpr scores.
-          for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
-            int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
-            if (PredRegScore <= PredLB)
-              continue;
-            int NewRegScore = PredScale + PredRegScore;
-            ScoreBrackets->setRegScore(
-                J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
-          }
-          // Also need to merge sgpr scores for lgkm_cnt.
-          if (T == LGKM_CNT) {
-            for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
-              int PredRegScore =
-                  KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
-              if (PredRegScore <= PredLB)
-                continue;
-              int NewRegScore = PredScale + PredRegScore;
-              ScoreBrackets->setRegScore(
-                  J + NUM_ALL_VGPRS, LGKM_CNT,
-                  std::max(
-                      ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
-                      NewRegScore));
-            }
-          }
-        }
-      }
+    const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
+    if (NewUB < ScoreUBs[T])
+      report_fatal_error("waitcnt score overflow");
+    ScoreUBs[T] = NewUB;
+    ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
 
-      // Also merge the WaitEvent information.
-      ForAllWaitEventType(W) {
-        enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
-        int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
-        if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
-          int NewEventUB =
-              MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
-          if (NewEventUB > 0) {
-            ScoreBrackets->setEventUB(
-                W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
-          }
-        }
-      }
-    }
-  }
+    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
 
-  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
-  // sequencing predecessors, because changes to EXEC require waitcnts due to
-  // the delayed nature of these operations.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (!BlockVisitedSet.count(Pred)) {
-      continue;
+    bool RegStrictDom = false;
+    for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
+         J++) {
+      RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
     }
 
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-
-    int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
-    if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
-      int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
-                       PredScoreBrackets->getScoreUB(EXP_CNT);
-      if (new_gds_ub > 0) {
-        ScoreBrackets->setEventUB(
-            GDS_GPR_LOCK,
-            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
-      }
-    }
-    int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
-    if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
-      int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
-                       PredScoreBrackets->getScoreUB(EXP_CNT);
-      if (new_exp_ub > 0) {
-        ScoreBrackets->setEventUB(
-            EXP_GPR_LOCK,
-            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
+    if (T == LGKM_CNT) {
+      for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
+           J != E; J++) {
+        RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
       }
     }
-  }
 
-  // if a single block loop, update the score brackets. Not needed for other
-  // blocks, as we did this in-place
-  if (IsSelfPred) {
-    BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+    if (RegStrictDom && !OldOutOfOrder)
+      StrictDom = true;
   }
-}
 
-/// Return true if the given basic block is a "bottom" block of a loop.
-/// This works even if the loop is discontiguous. This also handles
-/// multiple back-edges for the same "header" block of a loop.
-bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
-                                    const MachineBasicBlock *Block) {
-  for (MachineBasicBlock *MBB : Loop->blocks()) {
-    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
-      return true;
-    }
-  }
-  return false;
-}
+  VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
+  SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
 
-/// Count the number of "bottom" basic blocks of a loop.
-unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
-  unsigned Count = 0;
-  for (MachineBasicBlock *MBB : Loop->blocks()) {
-    if (MBB->isSuccessor(Loop->getHeader())) {
-      Count++;
-    }
-  }
-  return Count;
+  return StrictDom;
 }
 
 // Generate s_waitcnt instructions where needed.
-void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
-                                            MachineBasicBlock &Block) {
-  // Initialize the state information.
-  mergeInputScoreBrackets(Block);
-
-  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
+                                            MachineBasicBlock &Block,
+                                            WaitcntBrackets &ScoreBrackets) {
+  bool Modified = false;
 
   LLVM_DEBUG({
     dbgs() << "*** Block" << Block.getNumber() << " ***";
-    ScoreBrackets->dump();
+    ScoreBrackets.dump();
   });
 
   // Walk over the instructions.
+  MachineInstr *OldWaitcntInstr = nullptr;
+
   for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
        Iter != E;) {
     MachineInstr &Inst = *Iter;
+
     // Remove any previously existing waitcnts.
     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
-      // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
-      // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
-      // as needed.
-      if (!TrackedWaitcntSet.count(&Inst))
-        ++Iter;
-      else {
-        ++Iter;
-        Inst.removeFromParent();
+      if (OldWaitcntInstr) {
+        if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
+          TrackedWaitcntSet.erase(OldWaitcntInstr);
+          OldWaitcntInstr->eraseFromParent();
+          OldWaitcntInstr = nullptr;
+        } else if (!TrackedWaitcntSet.count(&Inst)) {
+          // Two successive s_waitcnt's, both of which are pre-existing and
+          // are therefore preserved.
+          int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
+          ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
+        } else {
+          ++Iter;
+          Inst.eraseFromParent();
+          Modified = true;
+          continue;
+        }
       }
-      ScoreBrackets->setWaitcnt(&Inst);
-      continue;
-    }
 
-    // Kill instructions generate a conditional branch to the endmain block.
-    // Merge the current waitcnt state into the endmain block information.
-    // TODO: Are there other flavors of KILL instruction?
-    if (Inst.getOpcode() == AMDGPU::KILL) {
-      addKillWaitBracket(ScoreBrackets);
+      OldWaitcntInstr = &Inst;
+      ++Iter;
+      continue;
     }
 
     bool VCCZBugWorkAround = false;
     if (readsVCCZ(Inst) &&
         (!VCCZBugHandledSet.count(&Inst))) {
-      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
-              ScoreBrackets->getScoreUB(LGKM_CNT) &&
-          ScoreBrackets->hasPendingSMEM()) {
+      if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+              ScoreBrackets.getScoreUB(LGKM_CNT) &&
+          ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
         if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
           VCCZBugWorkAround = true;
       }
@@ -1722,9 +1278,10 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
     // Generate an s_waitcnt instruction to be placed before
     // cur_Inst, if needed.
-    generateWaitcntInstBefore(Inst, ScoreBrackets);
+    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+    OldWaitcntInstr = nullptr;
 
-    updateEventWaitcntAfter(Inst, ScoreBrackets);
+    updateEventWaitcntAfter(Inst, &ScoreBrackets);
 
 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
     // If this instruction generates a S_SETVSKIP because it is an
@@ -1737,11 +1294,9 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
     }
 #endif
 
-    ScoreBrackets->clearWaitcnt();
-
     LLVM_DEBUG({
       Inst.print(dbgs());
-      ScoreBrackets->dump();
+      ScoreBrackets.dump();
     });
 
     // Check to see if this is a GWS instruction. If so, and if this is CI or
@@ -1753,10 +1308,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
         Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
       // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
-      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
-      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-      ScoreBrackets->updateByWait(LGKM_CNT,
-                                  ScoreBrackets->getScoreUB(LGKM_CNT));
+      ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero());
     }
 
     // TODO: Remove this work-around after fixing the scheduler and enable the
@@ -1769,71 +1321,13 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
               AMDGPU::VCC)
           .addReg(AMDGPU::VCC);
       VCCZBugHandledSet.insert(&Inst);
+      Modified = true;
     }
 
     ++Iter;
   }
 
-  // Check if we need to force convergence at loop footer.
-  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
-  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
-    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
-    WaitcntData->print();
-    LLVM_DEBUG(dbgs() << '\n';);
-
-    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
-    // placement, but doesn't guarantee convergence for a loop. Each
-    // loop should take at most (n+1) iterations for it to converge naturally,
-    // where n is the number of bottom blocks. If this threshold is reached and
-    // the result hasn't converged, then we force convergence by inserting
-    // a s_waitcnt at the end of loop footer.
-    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
-      // To ensure convergence, need to make wait events at loop footer be no
-      // more than those from the previous iteration.
-      // As a simplification, instead of tracking individual scores and
-      // generating the precise wait count, just wait on 0.
-      bool HasPending = false;
-      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
-          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
-          HasPending = true;
-          break;
-        }
-      }
-
-      if (HasPending) {
-        if (!SWaitInst) {
-          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
-                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-                              .addImm(0);
-          TrackedWaitcntSet.insert(SWaitInst);
-#if 0 // TODO: Format the debug output
-          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
-          OutputTransformAdd(SWaitInst, context);
-#endif
-        }
-#if 0 // TODO: ??
-        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
-#endif
-      }
-
-      if (SWaitInst) {
-        LLVM_DEBUG({
-          SWaitInst->print(dbgs());
-          dbgs() << "\nAdjusted score board:";
-          ScoreBrackets->dump();
-        });
-
-        // Add this waitcnt to the block. It is either newly created or
-        // created in previous iterations and added back since block traversal
-        // always removes waitcnts.
-        insertWaitcntBeforeCF(Block, SWaitInst);
-        WaitcntData->setWaitcnt(SWaitInst);
-      }
-    }
-  }
+  return Modified;
 }
 
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
@@ -1841,14 +1335,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
-  MLI = &getAnalysis<MachineLoopInfo>();
-  IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
+  IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  AMDGPUASI = ST->getAMDGPUAS();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
-  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-       T = (enum InstCounterType)(T + 1))
+  for (auto T : inst_counter_types())
     ForceEmitWaitcnt[T] = false;
 
   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
@@ -1868,93 +1359,70 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
 
   TrackedWaitcntSet.clear();
-  BlockVisitedSet.clear();
   VCCZBugHandledSet.clear();
-  LoopWaitcntDataMap.clear();
-  BlockWaitcntProcessedSet.clear();
+  RpotIdxMap.clear();
+  BlockInfos.clear();
 
-  // Walk over the blocks in reverse post-dominator order, inserting
-  // s_waitcnt where needed.
-  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
-  bool Modified = false;
-  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
-           I = RPOT.begin(),
-           E = RPOT.end(), J = RPOT.begin();
-       I != E;) {
-    MachineBasicBlock &MBB = **I;
+  // Keep iterating over the blocks in reverse post order, inserting and
+  // updating s_waitcnt where needed, until a fix point is reached.
+  for (MachineBasicBlock *MBB :
+       ReversePostOrderTraversal<MachineFunction *>(&MF)) {
+    RpotIdxMap[MBB] = BlockInfos.size();
+    BlockInfos.emplace_back(MBB);
+  }
 
-    BlockVisitedSet.insert(&MBB);
+  std::unique_ptr<WaitcntBrackets> Brackets;
+  bool Modified = false;
+  bool Repeat;
+  do {
+    Repeat = false;
 
-    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
-    if (!ScoreBrackets) {
-      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
-      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
-    }
-    ScoreBrackets->setPostOrder(MBB.getNumber());
-    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
-    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
-      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
+    for (BlockInfo &BI : BlockInfos) {
+      if (!BI.Dirty)
+        continue;
 
-    // If we are walking into the block from before the loop, then guarantee
-    // at least 1 re-walk over the loop to propagate the information, even if
-    // no S_WAITCNT instructions were generated.
-    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
-      unsigned Count = countNumBottomBlocks(ContainingLoop);
+      unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
 
-      // If the loop has multiple back-edges, and so more than one "bottom"
-      // basic block, we have to guarantee a re-walk over every blocks.
-      if ((std::count(BlockWaitcntProcessedSet.begin(),
-                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
-        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
-                          << ContainingLoop->getHeader()->getNumber() << '\n';);
+      if (BI.Incoming) {
+        if (!Brackets)
+          Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
+        else
+          *Brackets = *BI.Incoming;
+      } else {
+        if (!Brackets)
+          Brackets = llvm::make_unique<WaitcntBrackets>(ST);
+        else
+          Brackets->clear();
       }
-    }
 
-    // Walk over the instructions.
-    insertWaitcntInBlock(MF, MBB);
+      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
+      BI.Dirty = false;
 
-    // Record that waitcnts have been processed at least once for this block.
-    BlockWaitcntProcessedSet.push_back(&MBB);
-
-    // See if we want to revisit the loop. If a loop has multiple back-edges,
-    // we shouldn't revisit the same "bottom" basic block.
-    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
-        std::count(BlockWaitcntProcessedSet.begin(),
-                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
-      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
-      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
-      if (EntrySB && EntrySB->getRevisitLoop()) {
-        EntrySB->setRevisitLoop(false);
-        J = I;
-        int32_t PostOrder = EntrySB->getPostOrder();
-        // TODO: Avoid this loop. Find another way to set I.
-        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
-                 X = RPOT.begin(),
-                 Y = RPOT.end();
-             X != Y; ++X) {
-          MachineBasicBlock &MBBX = **X;
-          if (MBBX.getNumber() == PostOrder) {
-            I = X;
-            break;
+      if (Brackets->hasPending()) {
+        BlockInfo *MoveBracketsToSucc = nullptr;
+        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
+          unsigned SuccIdx = RpotIdxMap[Succ];
+          BlockInfo &SuccBI = BlockInfos[SuccIdx];
+          if (!SuccBI.Incoming) {
+            SuccBI.Dirty = true;
+            if (SuccIdx <= Idx)
+              Repeat = true;
+            if (!MoveBracketsToSucc) {
+              MoveBracketsToSucc = &SuccBI;
+            } else {
+              SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
+            }
+          } else if (SuccBI.Incoming->merge(*Brackets)) {
+            SuccBI.Dirty = true;
+            if (SuccIdx <= Idx)
+              Repeat = true;
           }
         }
-        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
-        WaitcntData->incIterCnt();
-        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
-        continue;
-      } else {
-        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
-        // Loop converged, reset iteration count. If this loop gets revisited,
-        // it must be from an outer loop, the counter will restart, this will
-        // ensure we don't force convergence on such revisits.
-        WaitcntData->resetIterCnt();
+        if (MoveBracketsToSucc)
+          MoveBracketsToSucc->Incoming = std::move(Brackets);
       }
     }
-
-    J = I;
-    ++I;
-  }
+  } while (Repeat);
 
   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
 
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index b73d30940fc38..65ffc27b8b608 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -121,6 +121,10 @@ class InstSI <dag outs, dag ins, string asm = "",
   // This bit indicates that this is a D16 buffer instruction.
   field bit D16Buf = 0;
 
+  // This bit indicates that this uses the floating point double precision
+  // rounding mode flags
+  field bit FPDPRounding = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = SALU;
   let TSFlags{1} = VALU;
@@ -178,6 +182,8 @@ class InstSI <dag outs, dag ins, string asm = "",
 
   let TSFlags{50} = D16Buf;
 
+  let TSFlags{51} = FPDPRounding;
+
   let SchedRW = [Write32Bit];
 
   field bits<1> DisableSIDecoder = 0;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index f3745382a6f4b..2370d5fa7b27b 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -264,9 +265,10 @@ static bool isStride64(unsigned Opc) {
   }
 }
 
-bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
-                                        int64_t &Offset,
-                                        const TargetRegisterInfo *TRI) const {
+bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
+                                          MachineOperand *&BaseOp,
+                                          int64_t &Offset,
+                                          const TargetRegisterInfo *TRI) const {
   unsigned Opc = LdSt.getOpcode();
 
   if (isDS(LdSt)) {
@@ -274,11 +276,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     if (OffsetImm) {
       // Normal, single offset LDS instruction.
-      const MachineOperand *AddrReg =
-          getNamedOperand(LdSt, AMDGPU::OpName::addr);
-
-      BaseReg = AddrReg->getReg();
+      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
       Offset = OffsetImm->getImm();
+      assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                                "operands of type register.");
       return true;
     }
 
@@ -309,10 +310,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
       if (isStride64(Opc))
         EltSize *= 64;
 
-      const MachineOperand *AddrReg =
-          getNamedOperand(LdSt, AMDGPU::OpName::addr);
-      BaseReg = AddrReg->getReg();
+      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
       Offset = EltSize * Offset0;
+      assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                                "operands of type register.");
       return true;
     }
 
@@ -324,19 +325,20 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
     if (SOffset && SOffset->isReg())
       return false;
 
-    const MachineOperand *AddrReg =
-        getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+    MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     if (!AddrReg)
       return false;
 
     const MachineOperand *OffsetImm =
         getNamedOperand(LdSt, AMDGPU::OpName::offset);
-    BaseReg = AddrReg->getReg();
+    BaseOp = AddrReg;
     Offset = OffsetImm->getImm();
 
     if (SOffset) // soffset can be an inline immediate.
       Offset += SOffset->getImm();
 
+    assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                              "operands of type register.");
     return true;
   }
 
@@ -346,36 +348,46 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
     if (!OffsetImm)
       return false;
 
-    const MachineOperand *SBaseReg =
-        getNamedOperand(LdSt, AMDGPU::OpName::sbase);
-    BaseReg = SBaseReg->getReg();
+    MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
+    BaseOp = SBaseReg;
     Offset = OffsetImm->getImm();
+    assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                              "operands of type register.");
     return true;
   }
 
   if (isFLAT(LdSt)) {
-    const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+    MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     if (VAddr) {
       // Can't analyze 2 offsets.
       if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
         return false;
 
-      BaseReg = VAddr->getReg();
+      BaseOp = VAddr;
     } else {
       // scratch instructions have either vaddr or saddr.
-      BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
+      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
     }
 
     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
+    assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                              "operands of type register.");
     return true;
   }
 
   return false;
 }
 
-static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
-                                  const MachineInstr &MI2, unsigned BaseReg2) {
-  if (BaseReg1 == BaseReg2)
+static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
+                                  const MachineOperand &BaseOp1,
+                                  const MachineInstr &MI2,
+                                  const MachineOperand &BaseOp2) {
+  // Support only base operands with base registers.
+  // Note: this could be extended to support FI operands.
+  if (!BaseOp1.isReg() || !BaseOp2.isReg())
+    return false;
+
+  if (BaseOp1.isIdenticalTo(BaseOp2))
     return true;
 
   if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
@@ -401,12 +413,13 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
   return Base1 == Base2;
 }
 
-bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
-                                      unsigned BaseReg1,
-                                      MachineInstr &SecondLdSt,
-                                      unsigned BaseReg2,
+bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
+                                      MachineOperand &BaseOp2,
                                       unsigned NumLoads) const {
-  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
+  MachineInstr &FirstLdSt = *BaseOp1.getParent();
+  MachineInstr &SecondLdSt = *BaseOp2.getParent();
+
+  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
     return false;
 
   const MachineOperand *FirstDst = nullptr;
@@ -863,7 +876,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-  DebugLoc DL = MBB.findDebugLoc(MI);
+  const DebugLoc &DL = MBB.findDebugLoc(MI);
 
   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
@@ -907,16 +920,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     return;
   }
 
-  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
-    LLVMContext &Ctx = MF->getFunction().getContext();
-    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
-                  " spill register");
-    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
-      .addReg(SrcReg);
-
-    return;
-  }
-
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
   unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
@@ -972,9 +975,9 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
   MachineFunction *MF = MBB.getParent();
-  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-  DebugLoc DL = MBB.findDebugLoc(MI);
+  const DebugLoc &DL = MBB.findDebugLoc(MI);
   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
   unsigned SpillSize = TRI->getSpillSize(*RC);
@@ -986,6 +989,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     PtrInfo, MachineMemOperand::MOLoad, Size, Align);
 
   if (RI.isSGPRClass(RC)) {
+    MFI->setHasSpilledSGPRs();
+
     // FIXME: Maybe this should not include a memoperand because it will be
     // lowered to non-memory instructions.
     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
@@ -1009,15 +1014,6 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     return;
   }
 
-  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
-    LLVMContext &Ctx = MF->getFunction().getContext();
-    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
-                  " restore register");
-    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
-
-    return;
-  }
-
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
   unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
@@ -1036,7 +1032,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-  DebugLoc DL = MBB.findDebugLoc(MI);
+  const DebugLoc &DL = MBB.findDebugLoc(MI);
   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
   unsigned WavefrontSize = ST.getWavefrontSize();
 
@@ -1044,7 +1040,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
   if (!MFI->hasCalculatedTID()) {
     MachineBasicBlock &Entry = MBB.getParent()->front();
     MachineBasicBlock::iterator Insert = Entry.front();
-    DebugLoc DL = Insert->getDebugLoc();
+    const DebugLoc &DL = Insert->getDebugLoc();
 
     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                    *MF);
@@ -1421,10 +1417,15 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
 // TargetInstrInfo::commuteInstruction uses it.
 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
                                         unsigned &SrcOpIdx1) const {
-  if (!MI.isCommutable())
+  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
+}
+
+bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
+                                        unsigned &SrcOpIdx1) const {
+  if (!Desc.isCommutable())
     return false;
 
-  unsigned Opc = MI.getOpcode();
+  unsigned Opc = Desc.getOpcode();
   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
   if (Src0Idx == -1)
     return false;
@@ -1549,8 +1550,9 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
   //   buzz;
 
   RS->enterBasicBlockEnd(MBB);
-  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
-                                       MachineBasicBlock::iterator(GetPC), 0);
+  unsigned Scav = RS->scavengeRegisterBackwards(
+    AMDGPU::SReg_64RegClass,
+    MachineBasicBlock::iterator(GetPC), false, 0);
   MRI.replaceRegWith(PCReg, Scav);
   MRI.clearVirtRegs();
   RS->setRegUsed(Scav);
@@ -1644,7 +1646,34 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                 SmallVectorImpl<MachineOperand> &Cond,
                                 bool AllowModify) const {
   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-  if (I == MBB.end())
+  auto E = MBB.end();
+  if (I == E)
+    return false;
+
+  // Skip over the instructions that are artificially terminators for special
+  // exec management.
+  while (I != E && !I->isBranch() && !I->isReturn() &&
+         I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
+    switch (I->getOpcode()) {
+    case AMDGPU::SI_MASK_BRANCH:
+    case AMDGPU::S_MOV_B64_term:
+    case AMDGPU::S_XOR_B64_term:
+    case AMDGPU::S_ANDN2_B64_term:
+      break;
+    case AMDGPU::SI_IF:
+    case AMDGPU::SI_ELSE:
+    case AMDGPU::SI_KILL_I1_TERMINATOR:
+    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+      // FIXME: It's messy that these need to be considered here at all.
+      return true;
+    default:
+      llvm_unreachable("unexpected non-branch terminator inst");
+    }
+
+    ++I;
+  }
+
+  if (I == E)
     return false;
 
   if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
@@ -1933,20 +1962,20 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
 }
 
 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
-    PseudoSourceValue::PSVKind Kind) const {
+    unsigned Kind) const {
   switch(Kind) {
   case PseudoSourceValue::Stack:
   case PseudoSourceValue::FixedStack:
-    return ST.getAMDGPUAS().PRIVATE_ADDRESS;
+    return AMDGPUAS::PRIVATE_ADDRESS;
   case PseudoSourceValue::ConstantPool:
   case PseudoSourceValue::GOT:
   case PseudoSourceValue::JumpTable:
   case PseudoSourceValue::GlobalValueCallEntry:
   case PseudoSourceValue::ExternalSymbolCallEntry:
   case PseudoSourceValue::TargetCustom:
-    return ST.getAMDGPUAS().CONSTANT_ADDRESS;
+    return AMDGPUAS::CONSTANT_ADDRESS;
   }
-  return ST.getAMDGPUAS().FLAT_ADDRESS;
+  return AMDGPUAS::FLAT_ADDRESS;
 }
 
 static void removeModOperands(MachineInstr &MI) {
@@ -2066,12 +2095,40 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     if (Src2->isReg() && Src2->getReg() == Reg) {
       // Not allowed to use constant bus for another operand.
       // We can however allow an inline immediate as src0.
-      if (!Src0->isImm() &&
-          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
-        return false;
+      bool Src0Inlined = false;
+      if (Src0->isReg()) {
+        // Try to inline constant if possible.
+        // If the Def moves immediate and the use is single
+        // We are saving VGPR here.
+        MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
+        if (Def && Def->isMoveImmediate() &&
+          isInlineConstant(Def->getOperand(1)) &&
+          MRI->hasOneUse(Src0->getReg())) {
+          Src0->ChangeToImmediate(Def->getOperand(1).getImm());
+          Src0Inlined = true;
+        } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
+            RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
+            (RI.isVirtualRegister(Src0->getReg()) &&
+            RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
+          return false;
+          // VGPR is okay as Src0 - fallthrough
+      }
 
-      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
-        return false;
+      if (Src1->isReg() && !Src0Inlined ) {
+        // We have one slot for inlinable constant so far - try to fill it
+        MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
+        if (Def && Def->isMoveImmediate() &&
+            isInlineConstant(Def->getOperand(1)) &&
+            MRI->hasOneUse(Src1->getReg()) &&
+            commuteInstruction(UseMI)) {
+            Src0->ChangeToImmediate(Def->getOperand(1).getImm());
+        } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
+            RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
+            (RI.isVirtualRegister(Src1->getReg()) &&
+            RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+          return false;
+          // VGPR is okay as Src1 - fallthrough
+      }
 
       const int64_t Imm = ImmOp->getImm();
 
@@ -2117,11 +2174,13 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
 
 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
                                                MachineInstr &MIb) const {
-  unsigned BaseReg0, BaseReg1;
+  MachineOperand *BaseOp0, *BaseOp1;
   int64_t Offset0, Offset1;
 
-  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
-      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
+  if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
+      getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
+    if (!BaseOp0->isIdenticalTo(*BaseOp1))
+      return false;
 
     if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
       // FIXME: Handle ds_read2 / ds_write2.
@@ -2129,8 +2188,7 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
     }
     unsigned Width0 = (*MIa.memoperands_begin())->getSize();
     unsigned Width1 = (*MIb.memoperands_begin())->getSize();
-    if (BaseReg0 == BaseReg1 &&
-        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
+    if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
       return true;
     }
   }
@@ -2398,8 +2456,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
   case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
     int32_t Trunc = static_cast<int32_t>(Imm);
-    return Trunc == Imm &&
-           AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
+    return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
   }
   case AMDGPU::OPERAND_REG_IMM_INT64:
   case AMDGPU::OPERAND_REG_IMM_FP64:
@@ -2523,6 +2580,115 @@ bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
          hasModifiersSet(MI, AMDGPU::OpName::omod);
 }
 
+bool SIInstrInfo::canShrink(const MachineInstr &MI,
+                            const MachineRegisterInfo &MRI) const {
+  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+  // Can't shrink instruction with three operands.
+  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
+  // a special case for it.  It can only be shrunk if the third operand
+  // is vcc.  We should handle this the same way we handle vopc, by addding
+  // a register allocation hint pre-regalloc and then do the shrinking
+  // post-regalloc.
+  if (Src2) {
+    switch (MI.getOpcode()) {
+      default: return false;
+
+      case AMDGPU::V_ADDC_U32_e64:
+      case AMDGPU::V_SUBB_U32_e64:
+      case AMDGPU::V_SUBBREV_U32_e64: {
+        const MachineOperand *Src1
+          = getNamedOperand(MI, AMDGPU::OpName::src1);
+        if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
+          return false;
+        // Additional verification is needed for sdst/src2.
+        return true;
+      }
+      case AMDGPU::V_MAC_F32_e64:
+      case AMDGPU::V_MAC_F16_e64:
+      case AMDGPU::V_FMAC_F32_e64:
+        if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
+            hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
+          return false;
+        break;
+
+      case AMDGPU::V_CNDMASK_B32_e64:
+        break;
+    }
+  }
+
+  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
+               hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
+    return false;
+
+  // We don't need to check src0, all input types are legal, so just make sure
+  // src0 isn't using any modifiers.
+  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
+    return false;
+
+  // Can it be shrunk to a valid 32 bit opcode?
+  if (!hasVALU32BitEncoding(MI.getOpcode()))
+    return false;
+
+  // Check output modifiers
+  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
+         !hasModifiersSet(MI, AMDGPU::OpName::clamp);
+}
+
+// Set VCC operand with all flags from \p Orig, except for setting it as
+// implicit.
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+                                   const MachineOperand &Orig) {
+
+  for (MachineOperand &Use : MI.implicit_operands()) {
+    if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
+      Use.setIsUndef(Orig.isUndef());
+      Use.setIsKill(Orig.isKill());
+      return;
+    }
+  }
+}
+
+MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
+                                           unsigned Op32) const {
+  MachineBasicBlock *MBB = MI.getParent();;
+  MachineInstrBuilder Inst32 =
+    BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
+
+  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
+  // For VOPC instructions, this is replaced by an implicit def of vcc.
+  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
+  if (Op32DstIdx != -1) {
+    // dst
+    Inst32.add(MI.getOperand(0));
+  } else {
+    assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+           "Unexpected case");
+  }
+
+  Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
+
+  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1)
+    Inst32.add(*Src1);
+
+  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+
+  if (Src2) {
+    int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
+    if (Op32Src2Idx != -1) {
+      Inst32.add(*Src2);
+    } else {
+      // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
+      // replaced with an implicit read of vcc. This was already added
+      // during the initial BuildMI, so find it to preserve the flags.
+      copyFlagsToImplicitVCC(*Inst32, *Src2);
+    }
+  }
+
+  return Inst32;
+}
+
 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                   const MachineOperand &MO,
                                   const MCOperandInfo &OpInfo) const {
@@ -2806,6 +2972,42 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     }
   }
 
+  // Verify MIMG
+  if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
+    // Ensure that the return type used is large enough for all the options
+    // being used TFE/LWE require an extra result register.
+    const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
+    if (DMask) {
+      uint64_t DMaskImm = DMask->getImm();
+      uint32_t RegCount =
+          isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
+      const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
+      const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
+      const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
+
+      // Adjust for packed 16 bit values
+      if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
+        RegCount >>= 1;
+
+      // Adjust if using LWE or TFE
+      if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
+        RegCount += 1;
+
+      const uint32_t DstIdx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+      const MachineOperand &Dst = MI.getOperand(DstIdx);
+      if (Dst.isReg()) {
+        const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
+        uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
+        if (RegCount > DstSize) {
+          ErrInfo = "MIMG instruction returns too many registers for dst "
+                    "register class";
+          return false;
+        }
+      }
+    }
+  }
+
   // Verify VOP*. Ignore multiple sgpr operands on writelane.
   if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
       && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
@@ -3001,6 +3203,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+  case AMDGPU::S_XNOR_B32:
+    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
@@ -3438,8 +3642,13 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
   // pointer value is uniform.
   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
-      unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
-      SBase->setReg(SGPR);
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
+  }
+  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
+  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
+    unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
+    SOff->setReg(SGPR);
   }
 }
 
@@ -3475,7 +3684,191 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
     FoldImmediate(*Copy, *Def, OpReg, &MRI);
 }
 
-void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
+// Emit the actual waterfall loop, executing the wrapped instruction for each
+// unique value of \p Rsrc across all lanes. In the best case we execute 1
+// iteration, in the worst case we execute 64 (once per lane).
+static void
+emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
+                          MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
+                          const DebugLoc &DL, MachineOperand &Rsrc) {
+  MachineBasicBlock::iterator I = LoopBB.begin();
+
+  unsigned VRsrc = Rsrc.getReg();
+  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
+
+  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+
+  // Beginning of the loop, read the next Rsrc variant.
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
+
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
+      .addReg(SRsrcSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(SRsrcSub1)
+      .addImm(AMDGPU::sub1)
+      .addReg(SRsrcSub2)
+      .addImm(AMDGPU::sub2)
+      .addReg(SRsrcSub3)
+      .addImm(AMDGPU::sub3);
+
+  // Update Rsrc operand to use the SGPR Rsrc.
+  Rsrc.setReg(SRsrc);
+  Rsrc.setIsKill(true);
+
+  // Identify all lanes with identical Rsrc operands in their VGPRs.
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
+      .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
+      .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
+      .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
+      .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
+      .addReg(CondReg0)
+      .addReg(CondReg1);
+
+  MRI.setSimpleHint(SaveExec, AndCond);
+
+  // Update EXEC to matching lanes, saving original to SaveExec.
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
+      .addReg(AndCond, RegState::Kill);
+
+  // The original instruction is here; we insert the terminators after it.
+  I = LoopBB.end();
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC)
+      .addReg(SaveExec);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
+}
+
+// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
+// with SGPRs by iterating over all unique values across all lanes.
+static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
+                              MachineOperand &Rsrc, MachineDominatorTree *MDT) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineBasicBlock::iterator I(&MI);
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+  // Save the EXEC mask
+  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
+      .addReg(AMDGPU::EXEC);
+
+  // Killed uses in the instruction we are waterfalling around will be
+  // incorrect due to the added control-flow.
+  for (auto &MO : MI.uses()) {
+    if (MO.isReg() && MO.isUse()) {
+      MRI.clearKillFlags(MO.getReg());
+    }
+  }
+
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF.insert(MBBI, LoopBB);
+  MF.insert(MBBI, RemainderBB);
+
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(RemainderBB);
+
+  // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
+  MachineBasicBlock::iterator J = I++;
+  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+  LoopBB->splice(LoopBB->begin(), &MBB, J);
+
+  MBB.addSuccessor(LoopBB);
+
+  // Update dominators. We know that MBB immediately dominates LoopBB, that
+  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
+  // dominates all of the successors transferred to it from MBB that MBB used
+  // to dominate.
+  if (MDT) {
+    MDT->addNewBlock(LoopBB, &MBB);
+    MDT->addNewBlock(RemainderBB, LoopBB);
+    for (auto &Succ : RemainderBB->successors()) {
+      if (MDT->dominates(&MBB, Succ)) {
+        MDT->changeImmediateDominator(Succ, RemainderBB);
+      }
+    }
+  }
+
+  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
+
+  // Restore the EXEC mask
+  MachineBasicBlock::iterator First = RemainderBB->begin();
+  BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+      .addReg(SaveExec);
+}
+
+// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
+static std::tuple<unsigned, unsigned>
+extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Extract the ptr from the resource descriptor.
+  unsigned RsrcPtr =
+      TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
+                             AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
+
+  // Create an empty resource descriptor
+  unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+  uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
+
+  // Zero64 = 0
+  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
+      .addImm(0);
+
+  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
+      .addImm(RsrcDataFormat & 0xFFFFFFFF);
+
+  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
+      .addImm(RsrcDataFormat >> 32);
+
+  // NewSRsrc = {Zero64, SRsrcFormat}
+  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+      .addReg(Zero64)
+      .addImm(AMDGPU::sub0_sub1)
+      .addReg(SRsrcFormatLo)
+      .addImm(AMDGPU::sub2)
+      .addReg(SRsrcFormatHi)
+      .addImm(AMDGPU::sub3);
+
+  return std::make_tuple(RsrcPtr, NewSRsrc);
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr &MI,
+                                   MachineDominatorTree *MDT) const {
   MachineFunction &MF = *MI.getParent()->getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
@@ -3617,75 +4010,56 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
     return;
   }
 
-  // Legalize MUBUF* instructions by converting to addr64 form.
-  // FIXME: If we start using the non-addr64 instructions for compute, we
-  // may need to legalize them as above. This especially applies to the
-  // buffer_load_format_* variants and variants with idxen (or bothen).
-  int SRsrcIdx =
+  // Legalize MUBUF* instructions.
+  int RsrcIdx =
       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
-  if (SRsrcIdx != -1) {
+  if (RsrcIdx != -1) {
     // We have an MUBUF instruction
-    MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
-    unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
-    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
-                                             RI.getRegClass(SRsrcRC))) {
+    MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
+    unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
+    if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
+                             RI.getRegClass(RsrcRC))) {
       // The operands are legal.
       // FIXME: We may need to legalize operands besided srsrc.
       return;
     }
 
-    MachineBasicBlock &MBB = *MI.getParent();
-
-    // Extract the ptr from the resource descriptor.
-    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
-      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
-
-    // Create an empty resource descriptor
-    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
-    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
-
-    // Zero64 = 0
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
-        .addImm(0);
+    // Legalize a VGPR Rsrc.
+    //
+    // If the instruction is _ADDR64, we can avoid a waterfall by extracting
+    // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
+    // a zero-value SRsrc.
+    //
+    // If the instruction is _OFFSET (both idxen and offen disabled), and we
+    // support ADDR64 instructions, we can convert to ADDR64 and do the same as
+    // above.
+    //
+    // Otherwise we are on non-ADDR64 hardware, and/or we have
+    // idxen/offen/bothen and we fall back to a waterfall loop.
 
-    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
-        .addImm(RsrcDataFormat & 0xFFFFFFFF);
-
-    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
-        .addImm(RsrcDataFormat >> 32);
-
-    // NewSRsrc = {Zero64, SRsrcFormat}
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
-        .addReg(Zero64)
-        .addImm(AMDGPU::sub0_sub1)
-        .addReg(SRsrcFormatLo)
-        .addImm(AMDGPU::sub2)
-        .addReg(SRsrcFormatHi)
-        .addImm(AMDGPU::sub3);
+    MachineBasicBlock &MBB = *MI.getParent();
 
     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
-    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-    if (VAddr) {
+    if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
       // This is already an ADDR64 instruction so we need to add the pointer
       // extracted from the resource descriptor to the current value of VAddr.
       unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
       unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+      unsigned RsrcPtr, NewSRsrc;
+      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
 
-      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
+      // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
       DebugLoc DL = MI.getDebugLoc();
       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
-        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
-        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+          .addReg(RsrcPtr, 0, AMDGPU::sub0)
+          .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
 
-      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
+      // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
-        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
-        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+          .addReg(RsrcPtr, 0, AMDGPU::sub1)
+          .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
 
       // NewVaddr = {NewVaddrHi, NewVaddrLo}
       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
@@ -3693,13 +4067,20 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
           .addImm(AMDGPU::sub0)
           .addReg(NewVAddrHi)
           .addImm(AMDGPU::sub1);
-    } else {
+
+      VAddr->setReg(NewVAddr);
+      Rsrc->setReg(NewSRsrc);
+    } else if (!VAddr && ST.hasAddr64()) {
       // This instructions is the _OFFSET variant, so we need to convert it to
       // ADDR64.
       assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
              < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
              "FIXME: Need to emit flat atomics here");
 
+      unsigned RsrcPtr, NewSRsrc;
+      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
+
+      unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
@@ -3715,10 +4096,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
         MachineInstrBuilder MIB =
             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                 .add(*VData)
-                .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
-                // This will be replaced later
-                // with the new value of vaddr.
-                .add(*SRsrc)
+                .addReg(NewVAddr)
+                .addReg(NewSRsrc)
                 .add(*SOffset)
                 .add(*Offset);
 
@@ -3735,21 +4114,19 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
           MIB.addImm(TFE->getImm());
         }
 
-        MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+        MIB.cloneMemRefs(MI);
         Addr64 = MIB;
       } else {
         // Atomics with return.
         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                      .add(*VData)
                      .add(*VDataIn)
-                     .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
-                     // This will be replaced later
-                     // with the new value of vaddr.
-                     .add(*SRsrc)
+                     .addReg(NewVAddr)
+                     .addReg(NewSRsrc)
                      .add(*SOffset)
                      .add(*Offset)
                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
-                     .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+                     .cloneMemRefs(MI);
       }
 
       MI.removeFromParent();
@@ -3757,23 +4134,20 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
       // NewVaddr = {NewVaddrHi, NewVaddrLo}
       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
               NewVAddr)
-          .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+          .addReg(RsrcPtr, 0, AMDGPU::sub0)
           .addImm(AMDGPU::sub0)
-          .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+          .addReg(RsrcPtr, 0, AMDGPU::sub1)
           .addImm(AMDGPU::sub1);
-
-      VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
-      SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
+    } else {
+      // This is another variant; legalize Rsrc with waterfall loop from VGPRs
+      // to SGPRs.
+      loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
     }
-
-    // Update the instruction to use NewVaddr
-    VAddr->setReg(NewVAddr);
-    // Update the instruction to use NewSRsrc
-    SRsrc->setReg(NewSRsrc);
   }
 }
 
-void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
+                             MachineDominatorTree *MDT) const {
   SetVectorType Worklist;
   Worklist.insert(&TopInst);
 
@@ -3791,34 +4165,62 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
       break;
     case AMDGPU::S_ADD_U64_PSEUDO:
     case AMDGPU::S_SUB_U64_PSEUDO:
-      splitScalar64BitAddSub(Worklist, Inst);
+      splitScalar64BitAddSub(Worklist, Inst, MDT);
       Inst.eraseFromParent();
       continue;
     case AMDGPU::S_ADD_I32:
     case AMDGPU::S_SUB_I32:
       // FIXME: The u32 versions currently selected use the carry.
-      if (moveScalarAddSub(Worklist, Inst))
+      if (moveScalarAddSub(Worklist, Inst, MDT))
         continue;
 
       // Default handling
       break;
     case AMDGPU::S_AND_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_OR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_XOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NAND_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NOR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_XNOR_B64:
+      if (ST.hasDLInsts())
+        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+      else
+        splitScalar64BitXnor(Worklist, Inst, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ANDN2_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ORN2_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_NOT_B64:
-      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
+      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
       Inst.eraseFromParent();
       continue;
 
@@ -3899,90 +4301,31 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
       Inst.eraseFromParent();
       continue;
 
-    case AMDGPU::S_XNOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
+    case AMDGPU::S_NAND_B32:
+      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
       Inst.eraseFromParent();
       continue;
 
-    case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
-      unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-      const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
-      auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
-      unsigned Offset = 0;
-
-      // FIXME: This isn't safe because the addressing mode doesn't work
-      // correctly if vaddr is negative.
-      //
-      // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
-      //
-      // See if we can extract an immediate offset by recognizing one of these:
-      //   V_ADD_I32_e32 dst, imm, src1
-      //   V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
-      // V_ADD will be removed by "Remove dead machine instructions".
-      if (Add &&
-          (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
-           Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
-        static const unsigned SrcNames[2] = {
-          AMDGPU::OpName::src0,
-          AMDGPU::OpName::src1,
-        };
-
-        // Find a literal offset in one of source operands.
-        for (int i = 0; i < 2; i++) {
-          const MachineOperand *Src =
-            getNamedOperand(*Add, SrcNames[i]);
-
-          if (Src->isReg()) {
-            auto Mov = MRI.getUniqueVRegDef(Src->getReg());
-            if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
-              Src = &Mov->getOperand(1);
-          }
-
-          if (Src) {
-            if (Src->isImm())
-              Offset = Src->getImm();
-            else if (Src->isCImm())
-              Offset = Src->getCImm()->getZExtValue();
-          }
-
-          if (Offset && isLegalMUBUFImmOffset(Offset)) {
-            VAddr = getNamedOperand(*Add, SrcNames[!i]);
-            break;
-          }
-
-          Offset = 0;
-        }
-      }
-
-      MachineInstr *NewInstr =
-        BuildMI(*MBB, Inst, Inst.getDebugLoc(),
-              get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
-        .add(*VAddr) // vaddr
-        .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
-        .addImm(0) // soffset
-        .addImm(Offset) // offset
-        .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
-        .addImm(0) // slc
-        .addImm(0) // tfe
-        .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end())
-        .getInstr();
+    case AMDGPU::S_NOR_B32:
+      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
+      Inst.eraseFromParent();
+      continue;
 
-      MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
-                         VDst);
-      addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
+    case AMDGPU::S_ANDN2_B32:
+      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
       Inst.eraseFromParent();
+      continue;
 
-      // Legalize all operands other than the offset. Notably, convert the srsrc
-      // into SGPRs using v_readfirstlane if needed.
-      legalizeOperands(*NewInstr);
+    case AMDGPU::S_ORN2_B32:
+      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
+      Inst.eraseFromParent();
       continue;
     }
-    }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
       // We cannot move this instruction to the VALU, so we should try to
       // legalize its operands instead.
-      legalizeOperands(Inst);
+      legalizeOperands(Inst, MDT);
       continue;
     }
 
@@ -4071,7 +4414,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     }
 
     // Legalize the operands
-    legalizeOperands(Inst);
+    legalizeOperands(Inst, MDT);
 
     if (HasDst)
      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
@@ -4079,8 +4422,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
 }
 
 // Add/sub require special handling to deal with carry outs.
-bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
-                                   MachineInstr &Inst) const {
+bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                                   MachineDominatorTree *MDT) const {
   if (ST.hasAddNoCarry()) {
     // Assume there is no user of scc since we don't select this in that case.
     // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
@@ -4104,7 +4447,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
     Inst.setDesc(get(NewOpc));
     Inst.addImplicitDefUseOperands(*MBB.getParent());
     MRI.replaceRegWith(OldDstReg, ResultReg);
-    legalizeOperands(Inst);
+    legalizeOperands(Inst, MDT);
 
     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
     return true;
@@ -4151,23 +4494,116 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
   MachineOperand &Src0 = Inst.getOperand(1);
   MachineOperand &Src1 = Inst.getOperand(2);
 
-  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
-  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
-
-  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   if (ST.hasDLInsts()) {
+    unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
+    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
+
     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
       .add(Src0)
       .add(Src1);
+
+    MRI.replaceRegWith(Dest.getReg(), NewDest);
+    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   } else {
-    unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
-      .add(Src0)
+    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
+    // invert either source and then perform the XOR. If either source is a
+    // scalar register, then we can leave the inversion on the scalar unit to
+    // acheive a better distrubution of scalar and vector instructions.
+    bool Src0IsSGPR = Src0.isReg() &&
+                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
+    bool Src1IsSGPR = Src1.isReg() &&
+                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
+    MachineInstr *Not = nullptr;
+    MachineInstr *Xor = nullptr;
+    unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+    // Build a pair of scalar instructions and add them to the work list.
+    // The next iteration over the work list will lower these to the vector
+    // unit as necessary.
+    if (Src0IsSGPR) {
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+        .add(Src0);
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+      .addReg(Temp)
       .add(Src1);
+    } else if (Src1IsSGPR) {
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+        .add(Src1);
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+      .add(Src0)
+      .addReg(Temp);
+    } else {
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
+        .add(Src0)
+        .add(Src1);
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+        .addReg(Temp);
+      Worklist.insert(Not);
+    }
+
+    MRI.replaceRegWith(Dest.getReg(), NewDest);
 
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
-      .addReg(Xor);
+    Worklist.insert(Xor);
+
+    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   }
+}
+
+void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
+                                      MachineInstr &Inst,
+                                      unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+
+  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
+    .add(Src0)
+    .add(Src1);
+
+  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+    .addReg(Interm);
+
+  Worklist.insert(&Op);
+  Worklist.insert(&Not);
+
+  MRI.replaceRegWith(Dest.getReg(), NewDest);
+  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
+                                     MachineInstr &Inst,
+                                     unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+
+  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
+    .add(Src1);
+
+  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
+    .add(Src0)
+    .addReg(Interm);
+
+  Worklist.insert(&Not);
+  Worklist.insert(&Op);
 
   MRI.replaceRegWith(Dest.getReg(), NewDest);
   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
@@ -4200,13 +4636,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
 
   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
-  BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
+  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
 
   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                        AMDGPU::sub1, Src0SubRC);
 
   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
-  BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
+  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
 
   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -4217,6 +4653,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
 
   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
 
+  Worklist.insert(&LoHalf);
+  Worklist.insert(&HiHalf);
+
   // We don't need to legalizeOperands here because for a single operand, src0
   // will support any kind of input.
 
@@ -4224,8 +4663,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitScalar64BitAddSub(
-  SetVectorType &Worklist, MachineInstr &Inst) const {
+void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
+                                         MachineInstr &Inst,
+                                         MachineDominatorTree *MDT) const {
   bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
 
   MachineBasicBlock &MBB = *Inst.getParent();
@@ -4285,16 +4725,16 @@ void SIInstrInfo::splitScalar64BitAddSub(
 
   // Try to legalize the operands in case we need to swap the order to keep it
   // valid.
-  legalizeOperands(*LoHalf);
-  legalizeOperands(*HiHalf);
+  legalizeOperands(*LoHalf, MDT);
+  legalizeOperands(*HiHalf, MDT);
 
   // Move all users of this moved vlaue.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitScalar64BitBinaryOp(
-    SetVectorType &Worklist, MachineInstr &Inst,
-    unsigned Opcode) const {
+void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
+                                           MachineInstr &Inst, unsigned Opcode,
+                                           MachineDominatorTree *MDT) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
@@ -4321,6 +4761,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
                                                        AMDGPU::sub0, Src0SubRC);
   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                        AMDGPU::sub0, Src1SubRC);
+  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub1, Src0SubRC);
+  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+                                                       AMDGPU::sub1, Src1SubRC);
 
   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
@@ -4331,11 +4775,6 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
                               .add(SrcReg0Sub0)
                               .add(SrcReg1Sub0);
 
-  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
-                                                       AMDGPU::sub1, Src0SubRC);
-  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
-                                                       AMDGPU::sub1, Src1SubRC);
-
   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                               .add(SrcReg0Sub1)
@@ -4350,22 +4789,62 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
 
   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
 
-  // Try to legalize the operands in case we need to swap the order to keep it
-  // valid.
-  legalizeOperands(LoHalf);
-  legalizeOperands(HiHalf);
+  Worklist.insert(&LoHalf);
+  Worklist.insert(&HiHalf);
 
   // Move all users of this moved vlaue.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
+void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
+                                       MachineInstr &Inst,
+                                       MachineDominatorTree *MDT) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineBasicBlock::iterator MII = Inst;
+
+  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+
+  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  MachineOperand* Op0;
+  MachineOperand* Op1;
+
+  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
+    Op0 = &Src0;
+    Op1 = &Src1;
+  } else {
+    Op0 = &Src1;
+    Op1 = &Src0;
+  }
+
+  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
+    .add(*Op0);
+
+  unsigned NewDest = MRI.createVirtualRegister(DestRC);
+
+  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
+    .addReg(Interm)
+    .add(*Op1);
+
+  MRI.replaceRegWith(Dest.getReg(), NewDest);
+
+  Worklist.insert(&Xor);
+}
+
 void SIInstrInfo::splitScalar64BitBCNT(
     SetVectorType &Worklist, MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
   MachineBasicBlock::iterator MII = Inst;
-  DebugLoc DL = Inst.getDebugLoc();
+  const DebugLoc &DL = Inst.getDebugLoc();
 
   MachineOperand &Dest = Inst.getOperand(0);
   MachineOperand &Src = Inst.getOperand(1);
@@ -4401,7 +4880,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   MachineBasicBlock::iterator MII = Inst;
-  DebugLoc DL = Inst.getDebugLoc();
+  const DebugLoc &DL = Inst.getDebugLoc();
 
   MachineOperand &Dest = Inst.getOperand(0);
   uint32_t Imm = Inst.getOperand(2).getImm();
@@ -4546,10 +5025,10 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(
        make_range(MachineBasicBlock::iterator(SCCDefInst),
                       SCCDefInst.getParent()->end())) {
     // Exit if we find another SCC def.
-    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
+    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
       return;
 
-    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
+    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
       Worklist.insert(&MI);
   }
 }
@@ -4716,7 +5195,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
     return AMDGPU::NoRegister;
 
   assert(!MI.memoperands_empty() &&
-         (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS);
+         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
 
   FrameIndex = Addr->getIndex();
   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -4777,12 +5256,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
 
   // If we have a definitive size, we can use it. Otherwise we need to inspect
   // the operands to know the size.
-  //
-  // FIXME: Instructions that have a base 32-bit encoding report their size as
-  // 4, even though they are really 8 bytes if they have a literal operand.
-  if (DescSize != 0 && DescSize != 4)
-    return DescSize;
-
   if (isFixedSize(MI))
     return DescSize;
 
@@ -4791,23 +5264,27 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
   if (isVALU(MI) || isSALU(MI)) {
     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
     if (Src0Idx == -1)
-      return 4; // No operands.
+      return DescSize; // No operands.
 
     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
-      return 8;
+      return DescSize + 4;
 
     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
     if (Src1Idx == -1)
-      return 4;
+      return DescSize;
 
     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
-      return 8;
+      return DescSize + 4;
 
-    return 4;
-  }
+    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+    if (Src2Idx == -1)
+      return DescSize;
 
-  if (DescSize == 4)
-    return 4;
+    if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
+      return DescSize + 4;
+
+    return DescSize;
+  }
 
   switch (Opc) {
   case TargetOpcode::IMPLICIT_DEF:
@@ -4823,7 +5300,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
   }
   default:
-    llvm_unreachable("unable to find instruction size");
+    return DescSize;
   }
 }
 
@@ -4835,7 +5312,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
     return true;
 
   for (const MachineMemOperand *MMO : MI.memoperands()) {
-    if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS)
+    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
       return true;
   }
   return false;
@@ -5069,3 +5546,84 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
 
   return MCOp;
 }
+
+static
+TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
+  assert(RegOpnd.isReg());
+  return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
+                             getRegSubRegPair(RegOpnd);
+}
+
+TargetInstrInfo::RegSubRegPair
+llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
+  assert(MI.isRegSequence());
+  for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
+    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
+      auto &RegOp = MI.getOperand(1 + 2 * I);
+      return getRegOrUndef(RegOp);
+    }
+  return TargetInstrInfo::RegSubRegPair();
+}
+
+// Try to find the definition of reg:subreg in subreg-manipulation pseudos
+// Following a subreg of reg:subreg isn't supported
+static bool followSubRegDef(MachineInstr &MI,
+                            TargetInstrInfo::RegSubRegPair &RSR) {
+  if (!RSR.SubReg)
+    return false;
+  switch (MI.getOpcode()) {
+  default: break;
+  case AMDGPU::REG_SEQUENCE:
+    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
+    return true;
+  // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg
+  case AMDGPU::INSERT_SUBREG:
+    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
+      // inserted the subreg we're looking for
+      RSR = getRegOrUndef(MI.getOperand(2));
+    else { // the subreg in the rest of the reg
+      auto R1 = getRegOrUndef(MI.getOperand(1));
+      if (R1.SubReg) // subreg of subreg isn't supported
+        return false;
+      RSR.Reg = R1.Reg;
+    }
+    return true;
+  }
+  return false;
+}
+
+MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
+                                     MachineRegisterInfo &MRI) {
+  assert(MRI.isSSA());
+  if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
+    return nullptr;
+
+  auto RSR = P;
+  auto *DefInst = MRI.getVRegDef(RSR.Reg);
+  while (auto *MI = DefInst) {
+    DefInst = nullptr;
+    switch (MI->getOpcode()) {
+    case AMDGPU::COPY:
+    case AMDGPU::V_MOV_B32_e32: {
+      auto &Op1 = MI->getOperand(1);
+      if (Op1.isReg() &&
+        TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
+        if (Op1.isUndef())
+          return nullptr;
+        RSR = getRegSubRegPair(Op1);
+        DefInst = MRI.getVRegDef(RSR.Reg);
+      }
+      break;
+    }
+    default:
+      if (followSubRegDef(*MI, RSR)) {
+        if (!RSR.Reg)
+          return nullptr;
+        DefInst = MRI.getVRegDef(RSR.Reg);
+      }
+    }
+    if (!DefInst)
+      return MI;
+  }
+  return nullptr;
+}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index d681b926504ed..5b1a05f3785ec 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -37,6 +37,7 @@
 namespace llvm {
 
 class APInt;
+class MachineDominatorTree;
 class MachineRegisterInfo;
 class RegScavenger;
 class GCNSubtarget;
@@ -79,8 +80,8 @@ public:
 private:
   void swapOperands(MachineInstr &Inst) const;
 
-  bool moveScalarAddSub(SetVectorType &Worklist,
-                        MachineInstr &Inst) const;
+  bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                        MachineDominatorTree *MDT = nullptr) const;
 
   void lowerScalarAbs(SetVectorType &Worklist,
                       MachineInstr &Inst) const;
@@ -88,14 +89,26 @@ private:
   void lowerScalarXnor(SetVectorType &Worklist,
                        MachineInstr &Inst) const;
 
+  void splitScalarNotBinop(SetVectorType &Worklist,
+                           MachineInstr &Inst,
+                           unsigned Opcode) const;
+
+  void splitScalarBinOpN2(SetVectorType &Worklist,
+                          MachineInstr &Inst,
+                          unsigned Opcode) const;
+
   void splitScalar64BitUnaryOp(SetVectorType &Worklist,
                                MachineInstr &Inst, unsigned Opcode) const;
 
-  void splitScalar64BitAddSub(SetVectorType &Worklist,
-                              MachineInstr &Inst) const;
+  void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                              MachineDominatorTree *MDT = nullptr) const;
+
+  void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst,
+                                unsigned Opcode,
+                                MachineDominatorTree *MDT = nullptr) const;
 
-  void splitScalar64BitBinaryOp(SetVectorType &Worklist,
-                                MachineInstr &Inst, unsigned Opcode) const;
+  void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst,
+                                MachineDominatorTree *MDT = nullptr) const;
 
   void splitScalar64BitBCNT(SetVectorType &Worklist,
                             MachineInstr &Inst) const;
@@ -160,12 +173,11 @@ public:
                                int64_t &Offset1,
                                int64_t &Offset2) const override;
 
-  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
-                             int64_t &Offset,
-                             const TargetRegisterInfo *TRI) const final;
+  bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+                               int64_t &Offset,
+                               const TargetRegisterInfo *TRI) const final;
 
-  bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
-                           MachineInstr &SecondLdSt, unsigned BaseReg2,
+  bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2,
                            unsigned NumLoads) const override;
 
   bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
@@ -225,6 +237,9 @@ public:
   bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
 
+  bool findCommutedOpIndices(MCInstrDesc Desc, unsigned & SrcOpIdx0,
+   unsigned & SrcOpIdx1) const;
+
   bool isBranchOffsetInRange(unsigned BranchOpc,
                              int64_t BrOffset) const override;
 
@@ -276,7 +291,7 @@ public:
                           unsigned TrueReg, unsigned FalseReg) const;
 
   unsigned getAddressSpaceForPseudoSourceKind(
-             PseudoSourceValue::PSVKind Kind) const override;
+             unsigned Kind) const override;
 
   bool
   areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
@@ -589,6 +604,14 @@ public:
       return MI.getDesc().TSFlags & ClampFlags;
   }
 
+  static bool usesFPDPRounding(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
+  bool usesFPDPRounding(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(MI.isCopy());
     unsigned Dest = MI.getOperand(0).getReg();
@@ -689,6 +712,12 @@ public:
                        unsigned OpName) const;
   bool hasAnyModifiersSet(const MachineInstr &MI) const;
 
+  bool canShrink(const MachineInstr &MI,
+                 const MachineRegisterInfo &MRI) const;
+
+  MachineInstr *buildShrunkInst(MachineInstr &MI,
+                                unsigned NewOpcode) const;
+
   bool verifyInstruction(const MachineInstr &MI,
                          StringRef &ErrInfo) const override;
 
@@ -719,6 +748,16 @@ public:
   /// This form should usually be preferred since it handles operands
   /// with unknown register classes.
   unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
+    const MachineOperand &MO = MI.getOperand(OpNo);
+    if (MO.isReg()) {
+      if (unsigned SubReg = MO.getSubReg()) {
+        assert(RI.getRegSizeInBits(*RI.getSubClassWithSubReg(
+                                   MI.getParent()->getParent()->getRegInfo().
+                                     getRegClass(MO.getReg()), SubReg)) >= 32 &&
+               "Sub-dword subregs are not supported");
+        return RI.getSubRegIndexLaneMask(SubReg).getNumLanes() * 4;
+      }
+    }
     return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
   }
 
@@ -777,14 +816,16 @@ public:
                               MachineOperand &Op, MachineRegisterInfo &MRI,
                               const DebugLoc &DL) const;
 
-  /// Legalize all operands in this instruction.  This function may
-  /// create new instruction and insert them before \p MI.
-  void legalizeOperands(MachineInstr &MI) const;
+  /// Legalize all operands in this instruction.  This function may create new
+  /// instructions and control-flow around \p MI.  If present, \p MDT is
+  /// updated.
+  void legalizeOperands(MachineInstr &MI,
+                        MachineDominatorTree *MDT = nullptr) const;
 
   /// Replace this instruction's opcode with the equivalent VALU
   /// opcode.  This function will also move the users of \p MI to the
-  /// VALU if necessary.
-  void moveToVALU(MachineInstr &MI) const;
+  /// VALU if necessary. If present, \p MDT is updated.
+  void moveToVALU(MachineInstr &MI, MachineDominatorTree *MDT = nullptr) const;
 
   void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI,
                         int Count) const;
@@ -885,9 +926,36 @@ public:
   /// Return -1 if the target-specific opcode for the pseudo instruction does
   /// not exist. If Opcode is not a pseudo instruction, this is identity.
   int pseudoToMCOpcode(int Opcode) const;
-
 };
 
+/// \brief Returns true if a reg:subreg pair P has a TRC class
+inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P,
+                         const TargetRegisterClass &TRC,
+                         MachineRegisterInfo &MRI) {
+  auto *RC = MRI.getRegClass(P.Reg);
+  if (!P.SubReg)
+    return RC == &TRC;
+  auto *TRI = MRI.getTargetRegisterInfo();
+  return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg);
+}
+
+/// \brief Create RegSubRegPair from a register MachineOperand
+inline
+TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O) {
+  assert(O.isReg());
+  return TargetInstrInfo::RegSubRegPair(O.getReg(), O.getSubReg());
+}
+
+/// \brief Return the SubReg component from REG_SEQUENCE
+TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
+                                                    unsigned SubReg);
+
+/// \brief Return the defining instruction for a given reg:subreg pair
+/// skipping copy like instructions and subreg-manipulation pseudos.
+/// Following another subreg of a reg:subreg isn't supported.
+MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
+                               MachineRegisterInfo &MRI);
+
 namespace AMDGPU {
 
   LLVM_READONLY
@@ -900,6 +968,9 @@ namespace AMDGPU {
   int getSDWAOp(uint16_t Opcode);
 
   LLVM_READONLY
+  int getDPPOp32(uint16_t Opcode);
+
+  LLVM_READONLY
   int getBasicFromSDWAOp(uint16_t Opcode);
 
   LLVM_READONLY
@@ -911,6 +982,12 @@ namespace AMDGPU {
   LLVM_READONLY
   int getAddr64Inst(uint16_t Opcode);
 
+  /// Check if \p Opcode is an Addr64 opcode.
+  ///
+  /// \returns \p Opcode if it is an Addr64 opcode, otherwise -1.
+  LLVM_READONLY
+  int getIfAddr64Inst(uint16_t Opcode);
+
   LLVM_READONLY
   int getMUBUFNoLdsInst(uint16_t Opcode);
 
@@ -923,6 +1000,9 @@ namespace AMDGPU {
   LLVM_READONLY
   int getSOPKOp(uint16_t Opcode);
 
+  LLVM_READONLY
+  int getGlobalSaddrOp(uint16_t Opcode);
+
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 8fa37aa83daed..13afa4d4974bf 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -40,9 +40,9 @@ def SIEncodingFamily {
 
 def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
 
-def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
-  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
-                      [SDNPMayLoad, SDNPMemOperand]
+def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
+  SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
+  [SDNPMayLoad, SDNPMemOperand]
 >;
 
 def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
@@ -69,36 +69,34 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
 
-def SDTbuffer_load : SDTypeProfile<1, 9,
+def SDTtbuffer_load : SDTypeProfile<1, 8,
   [                     // vdata
    SDTCisVT<1, v4i32>,  // rsrc
    SDTCisVT<2, i32>,    // vindex(VGPR)
    SDTCisVT<3, i32>,    // voffset(VGPR)
    SDTCisVT<4, i32>,    // soffset(SGPR)
    SDTCisVT<5, i32>,    // offset(imm)
-   SDTCisVT<6, i32>,    // dfmt(imm)
-   SDTCisVT<7, i32>,    // nfmt(imm)
-   SDTCisVT<8, i32>,    // glc(imm)
-   SDTCisVT<9, i32>     // slc(imm)
+   SDTCisVT<6, i32>,    // format(imm)
+   SDTCisVT<7, i32>,    // cachecontrol(imm)
+   SDTCisVT<8, i1>      // idxen(imm)
   ]>;
 
-def SItbuffer_load :   SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load,
+def SItbuffer_load :   SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTtbuffer_load,
                               [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
 def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16",
-                                SDTbuffer_load,
+                                SDTtbuffer_load,
                                 [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
 
-def SDTtbuffer_store : SDTypeProfile<0, 10,
+def SDTtbuffer_store : SDTypeProfile<0, 9,
     [                     // vdata
      SDTCisVT<1, v4i32>,  // rsrc
      SDTCisVT<2, i32>,    // vindex(VGPR)
      SDTCisVT<3, i32>,    // voffset(VGPR)
      SDTCisVT<4, i32>,    // soffset(SGPR)
      SDTCisVT<5, i32>,    // offset(imm)
-     SDTCisVT<6, i32>,    // dfmt(imm)
-     SDTCisVT<7, i32>,    // nfmt(imm)
-     SDTCisVT<8, i32>,    // glc(imm)
-     SDTCisVT<9, i32>     // slc(imm)
+     SDTCisVT<6, i32>,    // format(imm)
+     SDTCisVT<7, i32>,    // cachecontrol(imm)
+     SDTCisVT<8, i1>      // idxen(imm)
     ]>;
 
 def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store,
@@ -110,13 +108,15 @@ def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16",
                                 SDTtbuffer_store,
                                 [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
 
-def SDTBufferLoad : SDTypeProfile<1, 5,
+def SDTBufferLoad : SDTypeProfile<1, 7,
     [                    // vdata
      SDTCisVT<1, v4i32>, // rsrc
-     SDTCisVT<2, i32>,   // vindex
-     SDTCisVT<3, i32>,   // offset
-     SDTCisVT<4, i1>,    // glc
-     SDTCisVT<5, i1>]>;  // slc
+     SDTCisVT<2, i32>,   // vindex(VGPR)
+     SDTCisVT<3, i32>,   // voffset(VGPR)
+     SDTCisVT<4, i32>,   // soffset(SGPR)
+     SDTCisVT<5, i32>,   // offset(imm)
+     SDTCisVT<6, i32>,   // cachepolicy(imm)
+     SDTCisVT<7, i1>]>;  // idxen(imm)
 
 def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
@@ -126,13 +126,15 @@ def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
                                 SDTBufferLoad,
                                 [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 
-def SDTBufferStore : SDTypeProfile<0, 6,
+def SDTBufferStore : SDTypeProfile<0, 8,
     [                    // vdata
      SDTCisVT<1, v4i32>, // rsrc
-     SDTCisVT<2, i32>,   // vindex
-     SDTCisVT<3, i32>,   // offset
-     SDTCisVT<4, i1>,    // glc
-     SDTCisVT<5, i1>]>;  // slc
+     SDTCisVT<2, i32>,   // vindex(VGPR)
+     SDTCisVT<3, i32>,   // voffset(VGPR)
+     SDTCisVT<4, i32>,   // soffset(SGPR)
+     SDTCisVT<5, i32>,   // offset(imm)
+     SDTCisVT<6, i32>,   // cachepolicy(imm)
+     SDTCisVT<7, i1>]>;  // idxen(imm)
 
 def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
                              [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
@@ -144,13 +146,16 @@ def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16",
                             [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
 
 class SDBufferAtomic<string opcode> : SDNode <opcode,
-  SDTypeProfile<1, 5,
+  SDTypeProfile<1, 8,
       [SDTCisVT<0, i32>,   // dst
        SDTCisVT<1, i32>,   // vdata
        SDTCisVT<2, v4i32>, // rsrc
-       SDTCisVT<3, i32>,   // vindex
-       SDTCisVT<4, i32>,   // offset
-       SDTCisVT<5, i1>]>,  // slc
+       SDTCisVT<3, i32>,   // vindex(VGPR)
+       SDTCisVT<4, i32>,   // voffset(VGPR)
+       SDTCisVT<5, i32>,   // soffset(SGPR)
+       SDTCisVT<6, i32>,   // offset(imm)
+       SDTCisVT<7, i32>,   // cachepolicy(imm)
+       SDTCisVT<8, i1>]>,  // idxen(imm)
   [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
 >;
 
@@ -166,14 +171,17 @@ def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
 def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
 
 def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
-  SDTypeProfile<1, 6,
+  SDTypeProfile<1, 9,
     [SDTCisVT<0, i32>,   // dst
      SDTCisVT<1, i32>,   // src
      SDTCisVT<2, i32>,   // cmp
      SDTCisVT<3, v4i32>, // rsrc
-     SDTCisVT<4, i32>,   // vindex
-     SDTCisVT<5, i32>,   // offset
-     SDTCisVT<6, i1>]>,  // slc
+     SDTCisVT<4, i32>,   // vindex(VGPR)
+     SDTCisVT<5, i32>,   // voffset(VGPR)
+     SDTCisVT<6, i32>,   // soffset(SGPR)
+     SDTCisVT<7, i32>,   // offset(imm)
+     SDTCisVT<8, i32>,   // cachepolicy(imm)
+     SDTCisVT<9, i1>]>,  // idxen(imm)
   [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
 >;
 
@@ -487,24 +495,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
 }]>;
 
 class VGPRImm <dag frag> : PatLeaf<frag, [{
-  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-    return false;
-  }
-  const SIRegisterInfo *SIRI =
-      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
-  unsigned Limit = 0;
-  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
-         Limit < 10 && U != E; ++U, ++Limit) {
-    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
-
-    // If the register class is unknown, it could be an unknown
-    // register class that needs to be an SGPR, e.g. an inline asm
-    // constraint
-    if (!RC || SIRI->isSGPRClass(RC))
-      return false;
-  }
-
-  return Limit < 10;
+  return isVGPRImm(N);
 }]>;
 
 def NegateImm : SDNodeXForm<imm, [{
@@ -746,14 +737,13 @@ def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
 def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
 def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
 def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
-def R128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
+def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>;
 def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>;
 def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
 def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
 def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
 
-def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>;
-def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>;
+def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>;
 
 def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
 
@@ -1632,7 +1622,7 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
                     0, // 64-bit dst - No DPP or SDWA for 64-bit operands
                     !if(!eq(Src0VT.Size, 64),
                         0, // 64-bit src0
-                        !if(!eq(Src0VT.Size, 64),
+                        !if(!eq(Src1VT.Size, 64),
                             0, // 64-bit src2
                             1
                         )
@@ -1641,6 +1631,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
             );
 }
 
+class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+                 ValueType Src1VT = i32> {
+  bit ret = !if(!eq(NumSrcArgs, 0), 0,
+                getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
 class BitOr<bit a, bit b> {
   bit ret = !if(a, 1, !if(b, 1, 0));
 }
@@ -1649,6 +1645,11 @@ class BitAnd<bit a, bit b> {
   bit ret = !if(a, !if(b, 1, 0), 0);
 }
 
+def PatGenMode {
+  int NoPattern = 0;
+  int Pattern   = 1;
+}
+
 class VOPProfile <list<ValueType> _ArgVT> {
 
   field list<ValueType> ArgVT = _ArgVT;
@@ -1715,7 +1716,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
   field bit HasSDWAOMod = isFloatType<DstVT>.ret;
 
   field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
-  field bit HasSDWA9 = HasExt;
+  field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+  field bit HasExtSDWA = HasExt;
+  field bit HasExtSDWA9 = HasExt;
+  field int NeedPatGen = PatGenMode.NoPattern;
 
   field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
   field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
@@ -1743,8 +1747,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
                                            getOpSelMod<Src0VT>.ret,
                                            getOpSelMod<Src1VT>.ret,
                                            getOpSelMod<Src2VT>.ret>.ret;
-  field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
-                               HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
+  field dag InsDPP = !if(HasExtDPP,
+                         getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
+                                   HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
+                         (ins));
   field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
                                  HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
                                  DstVT>.ret;
@@ -1758,14 +1764,21 @@ class VOPProfile <list<ValueType> _ArgVT> {
                                               HasSrc0FloatMods,
                                               HasSrc1FloatMods,
                                               HasSrc2FloatMods>.ret;
-  field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+  field string AsmDPP = !if(HasExtDPP,
+                            getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
   field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
   field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
 }
 
 class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
   let HasExt = 0;
-  let HasSDWA9 = 0;
+  let HasExtDPP = 0;
+  let HasExtSDWA = 0;
+  let HasExtSDWA9 = 0;
+}
+
+class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.Pattern> : VOPProfile <p.ArgVT> {
+  let NeedPatGen = mode;
 }
 
 def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
@@ -1788,6 +1801,8 @@ def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
 
 def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
 def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>;
+def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>;
 
 def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>;
 
@@ -1925,6 +1940,15 @@ def getBasicFromSDWAOp : InstrMapping {
   let ValueCols = [["Default"]];
 }
 
+// Maps ordinary instructions to their DPP counterparts
+def getDPPOp32 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["AsmVariantName"];
+  let KeyCol = ["Default"];
+  let ValueCols = [["DPP"]];
+}
+
 // Maps an commuted opcode to its original version
 def getCommuteOrig : InstrMapping {
   let FilterClass = "Commutable_REV";
@@ -1977,6 +2001,14 @@ def getAddr64Inst : InstrMapping {
   let ValueCols = [["1"]];
 }
 
+def getIfAddr64Inst : InstrMapping {
+  let FilterClass = "MUBUFAddr64Table";
+  let RowFields = ["OpName"];
+  let ColFields = ["IsAddr64"];
+  let KeyCol = ["1"];
+  let ValueCols = [["1"]];
+}
+
 def getMUBUFNoLdsInst : InstrMapping {
   let FilterClass = "MUBUFLdsTable";
   let RowFields = ["OpName"];
@@ -2003,6 +2035,15 @@ def getAtomicNoRetOp : InstrMapping {
   let ValueCols = [["0"]];
 }
 
+// Maps a GLOBAL to its SADDR form.
+def getGlobalSaddrOp : InstrMapping {
+  let FilterClass = "GlobalSaddrTable";
+  let RowFields = ["SaddrOp"];
+  let ColFields = ["IsSaddr"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
 include "SIInstructions.td"
 
 include "DSInstructions.td"
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 5c10646161b39..b6b00c2e4257a 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -15,8 +15,8 @@ class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateContro
   let SubtargetPredicate = isGCN;
 }
 
-include "VOPInstructions.td"
 include "SOPInstructions.td"
+include "VOPInstructions.td"
 include "SMInstructions.td"
 include "FLATInstructions.td"
 include "BUFInstructions.td"
@@ -164,29 +164,26 @@ def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
 
 } // End usesCustomInserter = 1, Defs = [SCC]
 
-let usesCustomInserter = 1, SALU = 1 in {
-def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
+let usesCustomInserter = 1 in {
+def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
 } // End let usesCustomInserter = 1, SALU = 1
 
-def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+def S_MOV_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
    (ins SSrc_b64:$src0)> {
-  let SALU = 1;
   let isAsCheapAsAMove = 1;
   let isTerminator = 1;
 }
 
-def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+def S_XOR_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
    (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
-  let SALU = 1;
   let isAsCheapAsAMove = 1;
   let isTerminator = 1;
   let Defs = [SCC];
 }
 
-def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+def S_ANDN2_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
    (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
-  let SALU = 1;
   let isAsCheapAsAMove = 1;
   let isTerminator = 1;
 }
@@ -250,7 +247,7 @@ def SI_LOOP : CFPseudoInstSI <
   (outs), (ins SReg_64:$saved, brtarget:$target),
   [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
   let Size = 8;
-  let isBranch = 0;
+  let isBranch = 1;
   let hasSideEffects = 1;
 }
 
@@ -267,14 +264,6 @@ def SI_END_CF : CFPseudoInstSI <
   let mayStore = 1;
 }
 
-def SI_BREAK : CFPseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$src),
-  [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
-  let Size = 4;
-  let isAsCheapAsAMove = 1;
-  let isReMaterializable = 1;
-}
-
 def SI_IF_BREAK : CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
   [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
@@ -283,14 +272,6 @@ def SI_IF_BREAK : CFPseudoInstSI <
   let isReMaterializable = 1;
 }
 
-def SI_ELSE_BREAK : CFPseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
-  [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
-  let Size = 4;
-  let isAsCheapAsAMove = 1;
-  let isReMaterializable = 1;
-}
-
 let Uses = [EXEC] in {
 
 multiclass PseudoInstKill <dag ins> {
@@ -326,6 +307,7 @@ def SI_ILLEGAL_COPY : SPseudoInstSI <
 def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
   let isTerminator = 1;
   let usesCustomInserter = 1;
+  let isBranch = 1;
 }
 
 def SI_PS_LIVE : PseudoInstSI <
@@ -598,7 +580,13 @@ def : Pat <
   (int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm<f32>:$imm, cond:$cond))),
   (SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 >;
-// TODO: we could add more variants for other types of conditionals
+
+  // TODO: we could add more variants for other types of conditionals
+
+def : Pat <
+  (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)),
+  (COPY $src) // Return the SGPRs representing i1 src
+>;
 
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
@@ -730,12 +718,14 @@ defm : SelectPat <i32, V_CNDMASK_B32_e64>;
 defm : SelectPat <f16, V_CNDMASK_B32_e64>;
 defm : SelectPat <f32, V_CNDMASK_B32_e64>;
 
+let AddedComplexity = 1 in {
 def : GCNPat <
-  (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
+  (i32 (add (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)),
   (V_BCNT_U32_B32_e64 $popcnt, $val)
 >;
+}
 def : GCNPat <
-  (i16 (add (i16 (trunc (ctpop i32:$popcnt))), i16:$val)),
+  (i16 (add (i16 (trunc (getDivergentFrag<ctpop>.ret i32:$popcnt))), i16:$val)),
   (V_BCNT_U32_B32_e64 $popcnt, $val)
 >;
 
@@ -867,6 +857,8 @@ def : BitConvert <f64, v2f32, VReg_64>;
 def : BitConvert <v2f32, f64, VReg_64>;
 def : BitConvert <f64, v2i32, VReg_64>;
 def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <v4i16, v4f16, VReg_64>;
+def : BitConvert <v4f16, v4i16, VReg_64>;
 
 // FIXME: Make SGPR
 def : BitConvert <v2i32, v4f16, VReg_64>;
@@ -1324,6 +1316,38 @@ def : GCNPat <
 >;
 
 def : GCNPat <
+  (i1 (add i1:$src0, i1:$src1)),
+  (S_XOR_B64 $src0, $src1)
+>;
+
+def : GCNPat <
+  (i1 (sub i1:$src0, i1:$src1)),
+  (S_XOR_B64 $src0, $src1)
+>;
+
+let AddedComplexity = 1 in {
+def : GCNPat <
+  (i1 (add i1:$src0, (i1 -1))),
+  (S_NOT_B64 $src0)
+>;
+
+def : GCNPat <
+  (i1 (sub i1:$src0, (i1 -1))),
+  (S_NOT_B64 $src0)
+>;
+}
+
+def : GCNPat <
+  (f16 (sint_to_fp i1:$src)),
+  (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src))
+>;
+
+def : GCNPat <
+  (f16 (uint_to_fp i1:$src)),
+  (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src))
+>;
+
+def : GCNPat <
   (f32 (sint_to_fp i1:$src)),
   (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
 >;
@@ -1464,13 +1488,32 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPa
 def : ExpPattern<AMDGPUexport, i32, EXP>;
 def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
 
-// COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs
+// COPY is workaround tablegen bug from multiple outputs
 // from S_LSHL_B32's multiple outputs from implicit scc def.
 def : GCNPat <
   (v2i16 (build_vector (i16 0), i16:$src1)),
-  (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
+  (v2i16 (COPY (S_LSHL_B32 i16:$src1, (i16 16))))
+>;
+
+def : GCNPat <
+  (v2i16 (build_vector i16:$src0, (i16 undef))),
+  (v2i16 (COPY $src0))
+>;
+
+def : GCNPat <
+  (v2f16 (build_vector f16:$src0, (f16 undef))),
+  (v2f16 (COPY $src0))
+>;
+
+def : GCNPat <
+  (v2i16 (build_vector (i16 undef), i16:$src1)),
+  (v2i16 (COPY (S_LSHL_B32 $src1, (i32 16))))
 >;
 
+def : GCNPat <
+  (v2f16 (build_vector (f16 undef), f16:$src1)),
+  (v2f16 (COPY (S_LSHL_B32 $src1, (i32 16))))
+>;
 
 let SubtargetPredicate = HasVOP3PInsts in {
 def : GCNPat <
@@ -1501,15 +1544,15 @@ def : GCNPat <
 } // End SubtargetPredicate = HasVOP3PInsts
 
 
-// def : GCNPat <
-//   (v2f16 (scalar_to_vector f16:$src0)),
-//   (COPY $src0)
-// >;
+def : GCNPat <
+  (v2f16 (scalar_to_vector f16:$src0)),
+  (COPY $src0)
+>;
 
-// def : GCNPat <
-//   (v2i16 (scalar_to_vector i16:$src0)),
-//   (COPY $src0)
-// >;
+def : GCNPat <
+  (v2i16 (scalar_to_vector i16:$src0)),
+  (COPY $src0)
+>;
 
 def : GCNPat <
   (v4i16 (scalar_to_vector i16:$src0)),
@@ -1587,18 +1630,19 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
 defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
 defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
 
-def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
-def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
+defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;
 
 }
 
 // This matches 16 permutations of
 // max(min(x, y), min(max(x, y), z))
 class FPMed3Pat<ValueType vt,
+                //SDPatternOperator max, SDPatternOperator min,
                 Instruction med3Inst> : GCNPat<
-  (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                            (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
-           (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+           (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                            (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                            (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
@@ -1606,28 +1650,41 @@ class FPMed3Pat<ValueType vt,
 
 class FP16Med3Pat<ValueType vt,
                 Instruction med3Inst> : GCNPat<
-  (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
-                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
-           (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
-                                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+           (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+                                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                            (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
 >;
 
-class Int16Med3Pat<Instruction med3Inst,
+multiclass Int16Med3Pat<Instruction med3Inst,
+                   SDPatternOperator min,
                    SDPatternOperator max,
                    SDPatternOperator max_oneuse,
                    SDPatternOperator min_oneuse,
-                   ValueType vt = i32> : GCNPat<
+                   ValueType vt = i16> {
+  // This matches 16 permutations of
+  // max(min(x, y), min(max(x, y), z))
+  def : GCNPat <
   (max (min_oneuse vt:$src0, vt:$src1),
        (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
   (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
 >;
 
+  // This matches 16 permutations of
+  // min(max(a, b), max(min(a, b), c))
+  def : GCNPat <
+  (min (max_oneuse vt:$src0, vt:$src1),
+      (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)),
+  (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
+>;
+}
+
 def : FPMed3Pat<f32, V_MED3_F32>;
 
 let OtherPredicates = [isGFX9] in {
 def : FP16Med3Pat<f16, V_MED3_F16>;
-def : Int16Med3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
-def : Int16Med3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
+defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
+defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
 } // End Predicates = [isGFX9]
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
index 7b7cf1635050b..e51ff4b4bc50e 100644
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/lib/Target/AMDGPU/SIIntrinsics.td
@@ -16,36 +16,4 @@
 let TargetPrefix = "SI", isTarget = 1 in {
   def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
 
-  // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
-  def int_SI_tbuffer_store : Intrinsic <
-    [],
-    [llvm_anyint_ty, // rsrc(SGPR)
-     llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32
-     llvm_i32_ty,    // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
-     llvm_i32_ty,    // vaddr(VGPR)
-     llvm_i32_ty,    // soffset(SGPR)
-     llvm_i32_ty,    // inst_offset(imm)
-     llvm_i32_ty,    // dfmt(imm)
-     llvm_i32_ty,    // nfmt(imm)
-     llvm_i32_ty,    // offen(imm)
-     llvm_i32_ty,    // idxen(imm)
-     llvm_i32_ty,    // glc(imm)
-     llvm_i32_ty,    // slc(imm)
-     llvm_i32_ty],   // tfe(imm)
-    []>;
-
-  // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed
-  def int_SI_buffer_load_dword : Intrinsic <
-    [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32
-    [llvm_anyint_ty,  // rsrc(SGPR)
-     llvm_anyint_ty,  // vaddr(VGPR)
-     llvm_i32_ty,     // soffset(SGPR)
-     llvm_i32_ty,     // inst_offset(imm)
-     llvm_i32_ty,     // offen(imm)
-     llvm_i32_ty,     // idxen(imm)
-     llvm_i32_ty,     // glc(imm)
-     llvm_i32_ty,     // slc(imm)
-     llvm_i32_ty],    // tfe(imm)
-    [IntrReadMem, IntrArgMemOnly]>;
-
 } // End TargetPrefix = "SI", isTarget = 1
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 4b537540046fe..be291b127301d 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -20,6 +20,26 @@
 // ==>
 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
 //
+// This pass also tries to promote constant offset to the immediate by
+// adjusting the base. It tries to use a base from the nearby instructions that
+// allows it to have a 13bit constant offset and then promotes the 13bit offset
+// to the immediate.
+// E.g.
+//  s_movk_i32 s0, 0x1800
+//  v_add_co_u32_e32 v0, vcc, s0, v2
+//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+//
+//  s_movk_i32 s0, 0x1000
+//  v_add_co_u32_e32 v5, vcc, s0, v2
+//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+//  global_load_dwordx2 v[5:6], v[5:6], off
+//  global_load_dwordx2 v[0:1], v[0:1], off
+// =>
+//  s_movk_i32 s0, 0x1000
+//  v_add_co_u32_e32 v5, vcc, s0, v2
+//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+//  global_load_dwordx2 v[5:6], v[5:6], off
+//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
 //
 // Future improvements:
 //
@@ -43,9 +63,9 @@
 
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
@@ -74,23 +94,38 @@ using namespace llvm;
 #define DEBUG_TYPE "si-load-store-opt"
 
 namespace {
+enum InstClassEnum {
+  UNKNOWN,
+  DS_READ,
+  DS_WRITE,
+  S_BUFFER_LOAD_IMM,
+  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
+  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
+  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
+  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
+  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
+  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
+};
 
-class SILoadStoreOptimizer : public MachineFunctionPass {
-  enum InstClassEnum {
-    DS_READ_WRITE,
-    S_BUFFER_LOAD_IMM,
-    BUFFER_LOAD_OFFEN,
-    BUFFER_LOAD_OFFSET,
-    BUFFER_STORE_OFFEN,
-    BUFFER_STORE_OFFSET,
-  };
+enum RegisterEnum {
+  SBASE = 0x1,
+  SRSRC = 0x2,
+  SOFFSET = 0x4,
+  VADDR = 0x8,
+  ADDR = 0x10,
+};
 
+class SILoadStoreOptimizer : public MachineFunctionPass {
   struct CombineInfo {
     MachineBasicBlock::iterator I;
     MachineBasicBlock::iterator Paired;
     unsigned EltSize;
     unsigned Offset0;
     unsigned Offset1;
+    unsigned Width0;
+    unsigned Width1;
     unsigned BaseOff;
     InstClassEnum InstClass;
     bool GLC0;
@@ -98,9 +133,23 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     bool SLC0;
     bool SLC1;
     bool UseST64;
-    bool IsX2;
-    SmallVector<MachineInstr*, 8> InstsToMove;
-   };
+    SmallVector<MachineInstr *, 8> InstsToMove;
+  };
+
+  struct BaseRegisters {
+    unsigned LoReg = 0;
+    unsigned HiReg = 0;
+
+    unsigned LoSubReg = 0;
+    unsigned HiSubReg = 0;
+  };
+
+  struct MemAddress {
+    BaseRegisters Base;
+    int64_t Offset = 0;
+  };
+
+  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
 
 private:
   const GCNSubtarget *STM = nullptr;
@@ -108,9 +157,16 @@ private:
   const SIRegisterInfo *TRI = nullptr;
   MachineRegisterInfo *MRI = nullptr;
   AliasAnalysis *AA = nullptr;
-  unsigned CreatedX2;
+  bool OptimizeAgain;
 
   static bool offsetsCanBeCombined(CombineInfo &CI);
+  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
+  static unsigned getNewOpcode(const CombineInfo &CI);
+  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
+  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
+  unsigned getOpcodeWidth(const MachineInstr &MI);
+  InstClassEnum getInstClass(unsigned Opc);
+  unsigned getRegs(unsigned Opc);
 
   bool findMatchingInst(CombineInfo &CI);
 
@@ -123,10 +179,21 @@ private:
   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
-  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
-                                    bool &IsOffen) const;
   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
 
+  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
+                           int32_t NewOffset);
+  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
+  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
+  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
+  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
+  /// Promotes constant offset to the immediate by adjusting the base. It
+  /// tries to use a base from the nearby instructions that allows it to have
+  /// a 13bit constant offset which gets promoted to the immediate.
+  bool promoteConstantOffsetToImm(MachineInstr &CI,
+                                  MemInfoMap &Visited,
+                                  SmallPtrSet<MachineInstr *, 4> &Promoted);
+
 public:
   static char ID;
 
@@ -153,8 +220,8 @@ public:
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                       "SI Load Store Optimizer", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
-                    "SI Load Store Optimizer", false, false)
+INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
+                    false, false)
 
 char SILoadStoreOptimizer::ID = 0;
 
@@ -165,7 +232,7 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() {
 }
 
 static void moveInstsAfter(MachineBasicBlock::iterator I,
-                           ArrayRef<MachineInstr*> InstsToMove) {
+                           ArrayRef<MachineInstr *> InstsToMove) {
   MachineBasicBlock *MBB = I->getParent();
   ++I;
   for (MachineInstr *MI : InstsToMove) {
@@ -191,21 +258,19 @@ static void addDefsUsesToList(const MachineInstr &MI,
 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                       MachineBasicBlock::iterator B,
                                       const SIInstrInfo *TII,
-                                      AliasAnalysis * AA) {
+                                      AliasAnalysis *AA) {
   // RAW or WAR - cannot reorder
   // WAW - cannot reorder
   // RAR - safe to reorder
   return !(A->mayStore() || B->mayStore()) ||
-    TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
+         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
 }
 
 // Add MI and its defs to the lists if MI reads one of the defs that are
 // already in the list. Returns true in that case.
-static bool
-addToListsIfDependent(MachineInstr &MI,
-                      DenseSet<unsigned> &RegDefs,
-                      DenseSet<unsigned> &PhysRegUses,
-                      SmallVectorImpl<MachineInstr*> &Insts) {
+static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
+                                  DenseSet<unsigned> &PhysRegUses,
+                                  SmallVectorImpl<MachineInstr *> &Insts) {
   for (MachineOperand &Use : MI.operands()) {
     // If one of the defs is read, then there is a use of Def between I and the
     // instruction that I will potentially be merged with. We will need to move
@@ -228,18 +293,16 @@ addToListsIfDependent(MachineInstr &MI,
   return false;
 }
 
-static bool
-canMoveInstsAcrossMemOp(MachineInstr &MemOp,
-                        ArrayRef<MachineInstr*> InstsToMove,
-                        const SIInstrInfo *TII,
-                        AliasAnalysis *AA) {
+static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+                                    ArrayRef<MachineInstr *> InstsToMove,
+                                    const SIInstrInfo *TII, AliasAnalysis *AA) {
   assert(MemOp.mayLoadOrStore());
 
   for (MachineInstr *InstToMove : InstsToMove) {
     if (!InstToMove->mayLoadOrStore())
       continue;
     if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
-        return false;
+      return false;
   }
   return true;
 }
@@ -260,10 +323,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   CI.BaseOff = 0;
 
   // Handle SMEM and VMEM instructions.
-  if (CI.InstClass != DS_READ_WRITE) {
-    unsigned Diff = CI.IsX2 ? 2 : 1;
-    return (EltOffset0 + Diff == EltOffset1 ||
-            EltOffset1 + Diff == EltOffset0) &&
+  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
+    return (EltOffset0 + CI.Width0 == EltOffset1 ||
+            EltOffset1 + CI.Width1 == EltOffset0) &&
            CI.GLC0 == CI.GLC1 &&
            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
   }
@@ -305,42 +367,176 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   return false;
 }
 
+bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
+                                     const CombineInfo &CI) {
+  const unsigned Width = (CI.Width0 + CI.Width1);
+  switch (CI.InstClass) {
+  default:
+    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
+  case S_BUFFER_LOAD_IMM:
+    switch (Width) {
+    default:
+      return false;
+    case 2:
+    case 4:
+      return true;
+    }
+  }
+}
+
+unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+
+  if (TII->isMUBUF(MI)) {
+    return AMDGPU::getMUBUFDwords(Opc);
+  }
+
+  switch (Opc) {
+  default:
+    return 0;
+  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+    return 1;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+    return 2;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+    return 4;
+  }
+}
+
+InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
+  if (TII->isMUBUF(Opc)) {
+    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
+
+    // If we couldn't identify the opcode, bail out.
+    if (baseOpcode == -1) {
+      return UNKNOWN;
+    }
+
+    switch (baseOpcode) {
+    default:
+      return UNKNOWN;
+    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
+      return BUFFER_LOAD_OFFEN;
+    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
+      return BUFFER_LOAD_OFFSET;
+    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+      return BUFFER_STORE_OFFEN;
+    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+      return BUFFER_STORE_OFFSET;
+    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
+      return BUFFER_LOAD_OFFEN_exact;
+    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
+      return BUFFER_LOAD_OFFSET_exact;
+    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+      return BUFFER_STORE_OFFEN_exact;
+    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+      return BUFFER_STORE_OFFSET_exact;
+    }
+  }
+
+  switch (Opc) {
+  default:
+    return UNKNOWN;
+  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+    return S_BUFFER_LOAD_IMM;
+  case AMDGPU::DS_READ_B32:
+  case AMDGPU::DS_READ_B64:
+  case AMDGPU::DS_READ_B32_gfx9:
+  case AMDGPU::DS_READ_B64_gfx9:
+    return DS_READ;
+  case AMDGPU::DS_WRITE_B32:
+  case AMDGPU::DS_WRITE_B64:
+  case AMDGPU::DS_WRITE_B32_gfx9:
+  case AMDGPU::DS_WRITE_B64_gfx9:
+    return DS_WRITE;
+  }
+}
+
+unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
+  if (TII->isMUBUF(Opc)) {
+    unsigned result = 0;
+
+    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
+      result |= VADDR;
+    }
+
+    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
+      result |= SRSRC;
+    }
+
+    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
+      result |= SOFFSET;
+    }
+
+    return result;
+  }
+
+  switch (Opc) {
+  default:
+    return 0;
+  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+    return SBASE;
+  case AMDGPU::DS_READ_B32:
+  case AMDGPU::DS_READ_B64:
+  case AMDGPU::DS_READ_B32_gfx9:
+  case AMDGPU::DS_READ_B64_gfx9:
+  case AMDGPU::DS_WRITE_B32:
+  case AMDGPU::DS_WRITE_B64:
+  case AMDGPU::DS_WRITE_B32_gfx9:
+  case AMDGPU::DS_WRITE_B64_gfx9:
+    return ADDR;
+  }
+}
+
 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   MachineBasicBlock::iterator E = MBB->end();
   MachineBasicBlock::iterator MBBI = CI.I;
 
-  unsigned AddrOpName[3] = {0};
-  int AddrIdx[3];
-  const MachineOperand *AddrReg[3];
+  const unsigned Opc = CI.I->getOpcode();
+  const InstClassEnum InstClass = getInstClass(Opc);
+
+  if (InstClass == UNKNOWN) {
+    return false;
+  }
+
+  const unsigned Regs = getRegs(Opc);
+
+  unsigned AddrOpName[5] = {0};
+  int AddrIdx[5];
+  const MachineOperand *AddrReg[5];
   unsigned NumAddresses = 0;
 
-  switch (CI.InstClass) {
-  case DS_READ_WRITE:
+  if (Regs & ADDR) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
-    break;
-  case S_BUFFER_LOAD_IMM:
+  }
+
+  if (Regs & SBASE) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
-    break;
-  case BUFFER_LOAD_OFFEN:
-  case BUFFER_STORE_OFFEN:
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
-    break;
-  case BUFFER_LOAD_OFFSET:
-  case BUFFER_STORE_OFFSET:
+  }
+
+  if (Regs & SRSRC) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+  }
+
+  if (Regs & SOFFSET) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
-    break;
+  }
+
+  if (Regs & VADDR) {
+    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
   }
 
   for (unsigned i = 0; i < NumAddresses; i++) {
     AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
     AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
 
-    // We only ever merge operations with the same base address register, so don't
-    // bother scanning forward if there are no other uses.
+    // We only ever merge operations with the same base address register, so
+    // don't bother scanning forward if there are no other uses.
     if (AddrReg[i]->isReg() &&
         (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
          MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
@@ -353,8 +549,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   DenseSet<unsigned> PhysRegUsesToMove;
   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
-  for ( ; MBBI != E; ++MBBI) {
-    if (MBBI->getOpcode() != CI.I->getOpcode()) {
+  for (; MBBI != E; ++MBBI) {
+    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
+
+    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
+        (IsDS && (MBBI->getOpcode() != Opc))) {
       // This is not a matching DS instruction, but we can keep looking as
       // long as one of these conditions are met:
       // 1. It is safe to move I down past MBBI.
@@ -368,8 +567,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       }
 
       if (MBBI->mayLoadOrStore() &&
-        (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
+          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
+           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
         // We fail condition #1, but we may still be able to satisfy condition
         // #2.  Add this instruction to the move list and then we will check
         // if condition #2 holds once we have selected the matching instruction.
@@ -413,8 +612,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
         continue;
       }
 
-      // Check same base pointer. Be careful of subregisters, which can occur with
-      // vectors of pointers.
+      // Check same base pointer. Be careful of subregisters, which can occur
+      // with vectors of pointers.
       if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
           AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
         Match = false;
@@ -423,13 +622,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     }
 
     if (Match) {
-      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
-                                                 AMDGPU::OpName::offset);
+      int OffsetIdx =
+          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
       CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+      CI.Width0 = getOpcodeWidth(*CI.I);
       CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
+      CI.Width1 = getOpcodeWidth(*MBBI);
       CI.Paired = MBBI;
 
-      if (CI.InstClass == DS_READ_WRITE) {
+      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
         CI.Offset0 &= 0xffff;
         CI.Offset1 &= 0xffff;
       } else {
@@ -445,7 +646,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       // We also need to go through the list of instructions that we plan to
       // move and make sure they are all safe to move down past the merged
       // instruction.
-      if (offsetsCanBeCombined(CI))
+      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
         if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
           return true;
     }
@@ -472,12 +673,12 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
   if (STM->ldsRequiresM0Init())
     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
 
-  return (EltSize == 4) ?
-    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
+  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
+                        : AMDGPU::DS_READ2ST64_B64_gfx9;
 }
 
-MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
-  CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be careful, since the addresses could be subregisters themselves in weird
@@ -489,8 +690,8 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
 
   unsigned NewOffset0 = CI.Offset0;
   unsigned NewOffset1 = CI.Offset1;
-  unsigned Opc = CI.UseST64 ?
-    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
+  unsigned Opc =
+      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
 
   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
@@ -502,39 +703,40 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
   }
 
   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
-         (NewOffset0 != NewOffset1) &&
-         "Computed offset doesn't fit");
+         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
 
   const MCInstrDesc &Read2Desc = TII->get(Opc);
 
-  const TargetRegisterClass *SuperRC
-    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+  const TargetRegisterClass *SuperRC =
+      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
 
   DebugLoc DL = CI.I->getDebugLoc();
 
   unsigned BaseReg = AddrReg->getReg();
+  unsigned BaseSubReg = AddrReg->getSubReg();
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
-      .addImm(CI.BaseOff);
+        .addImm(CI.BaseOff);
 
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
-      .addReg(ImmReg)
-      .addReg(AddrReg->getReg());
+        .addReg(ImmReg)
+        .addReg(AddrReg->getReg(), 0, BaseSubReg);
+    BaseSubReg = 0;
   }
 
   MachineInstrBuilder Read2 =
-    BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
-      .addReg(BaseReg, BaseRegFlags) // addr
-      .addImm(NewOffset0)            // offset0
-      .addImm(NewOffset1)            // offset1
-      .addImm(0)                     // gds
-      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
+          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
+          .addImm(NewOffset0)                        // offset0
+          .addImm(NewOffset1)                        // offset1
+          .addImm(0)                                 // gds
+          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
   (void)Read2;
 
@@ -561,32 +763,36 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
   if (STM->ldsRequiresM0Init())
     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
-  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
+  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
+                        : AMDGPU::DS_WRITE2_B64_gfx9;
 }
 
 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
   if (STM->ldsRequiresM0Init())
-    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
+                          : AMDGPU::DS_WRITE2ST64_B64;
 
-  return (EltSize == 4) ?
-    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
+  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
+                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
 }
 
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
-  CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
   // sure we preserve the subregister index and any register flags set on them.
-  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
-  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
-  const MachineOperand *Data1
-    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
+  const MachineOperand *AddrReg =
+      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+  const MachineOperand *Data0 =
+      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
+  const MachineOperand *Data1 =
+      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
 
   unsigned NewOffset0 = CI.Offset0;
   unsigned NewOffset1 = CI.Offset1;
-  unsigned Opc = CI.UseST64 ?
-    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+  unsigned Opc =
+      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
 
   if (NewOffset0 > NewOffset1) {
     // Canonicalize the merged instruction so the smaller offset comes first.
@@ -595,36 +801,37 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   }
 
   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
-         (NewOffset0 != NewOffset1) &&
-         "Computed offset doesn't fit");
+         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
 
   const MCInstrDesc &Write2Desc = TII->get(Opc);
   DebugLoc DL = CI.I->getDebugLoc();
 
   unsigned BaseReg = AddrReg->getReg();
+  unsigned BaseSubReg = AddrReg->getSubReg();
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
-      .addImm(CI.BaseOff);
+        .addImm(CI.BaseOff);
 
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
-      .addReg(ImmReg)
-      .addReg(AddrReg->getReg());
+        .addReg(ImmReg)
+        .addReg(AddrReg->getReg(), 0, BaseSubReg);
+    BaseSubReg = 0;
   }
 
   MachineInstrBuilder Write2 =
-    BuildMI(*MBB, CI.Paired, DL, Write2Desc)
-      .addReg(BaseReg, BaseRegFlags) // addr
-      .add(*Data0)                   // data0
-      .add(*Data1)                   // data1
-      .addImm(NewOffset0)            // offset0
-      .addImm(NewOffset1)            // offset1
-      .addImm(0)                     // gds
-      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
+          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
+          .add(*Data0)                               // data0
+          .add(*Data1)                               // data1
+          .addImm(NewOffset0)                        // offset0
+          .addImm(NewOffset1)                        // offset1
+          .addImm(0)                                 // gds
+          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
   moveInstsAfter(Write2, CI.InstsToMove);
 
@@ -636,15 +843,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   return Next;
 }
 
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
-  CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
-                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+  const unsigned Opcode = getNewOpcode(CI);
+
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
 
-  const TargetRegisterClass *SuperRC =
-    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
 
@@ -652,14 +858,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
       .addImm(MergedOffset) // offset
       .addImm(CI.GLC0)      // glc
-      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
-  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
-  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
-
-  // Handle descending offsets
-  if (CI.Offset0 > CI.Offset1)
-    std::swap(SubRegIdx0, SubRegIdx1);
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
@@ -681,29 +884,25 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
   return Next;
 }
 
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
-  CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  unsigned Opcode;
 
-  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
-    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
-                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
-  } else {
-    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
-                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
-  }
+  const unsigned Opcode = getNewOpcode(CI);
 
-  const TargetRegisterClass *SuperRC =
-    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+
+  // Copy to the new source register.
   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
 
   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
 
-  if (CI.InstClass == BUFFER_LOAD_OFFEN)
-      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+  const unsigned Regs = getRegs(Opcode);
+
+  if (Regs & VADDR)
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
 
   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -711,14 +910,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
       .addImm(CI.GLC0)      // glc
       .addImm(CI.SLC0)      // slc
       .addImm(0)            // tfe
-      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
-  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
-  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
-
-  // Handle descending offsets
-  if (CI.Offset0 > CI.Offset1)
-    std::swap(SubRegIdx0, SubRegIdx1);
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
@@ -740,57 +936,137 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
   return Next;
 }
 
-unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
-  const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
-  IsX2 = false;
-  IsOffen = false;
+unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
+  const unsigned Width = CI.Width0 + CI.Width1;
 
-  switch (I.getOpcode()) {
-  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
-  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
-    IsX2 = true;
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
-    IsX2 = true;
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
-  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
-  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
-    IsX2 = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
-    IsX2 = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
+  switch (CI.InstClass) {
+  default:
+    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
+  case UNKNOWN:
+    llvm_unreachable("Unknown instruction class");
+  case S_BUFFER_LOAD_IMM:
+    switch (Width) {
+    default:
+      return 0;
+    case 2:
+      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+    case 4:
+      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+    }
   }
-  return 0;
 }
 
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
-  CombineInfo &CI) {
+std::pair<unsigned, unsigned>
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
+  if (CI.Offset0 > CI.Offset1) {
+    switch (CI.Width0) {
+    default:
+      return std::make_pair(0, 0);
+    case 1:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
+      case 2:
+        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
+      case 3:
+        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
+      }
+    case 2:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
+      case 2:
+        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
+      }
+    case 3:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
+      }
+    }
+  } else {
+    switch (CI.Width0) {
+    default:
+      return std::make_pair(0, 0);
+    case 1:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
+      case 2:
+        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
+      case 3:
+        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
+      }
+    case 2:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
+      case 2:
+        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
+      }
+    case 3:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
+      }
+    }
+  }
+}
+
+const TargetRegisterClass *
+SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
+  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
+    switch (CI.Width0 + CI.Width1) {
+    default:
+      return nullptr;
+    case 2:
+      return &AMDGPU::SReg_64_XEXECRegClass;
+    case 4:
+      return &AMDGPU::SReg_128RegClass;
+    case 8:
+      return &AMDGPU::SReg_256RegClass;
+    case 16:
+      return &AMDGPU::SReg_512RegClass;
+    }
+  } else {
+    switch (CI.Width0 + CI.Width1) {
+    default:
+      return nullptr;
+    case 2:
+      return &AMDGPU::VReg_64RegClass;
+    case 3:
+      return &AMDGPU::VReg_96RegClass;
+    case 4:
+      return &AMDGPU::VReg_128RegClass;
+    }
+  }
+}
+
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  bool Unused1, Unused2;
-  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
 
-  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
-  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+  const unsigned Opcode = getNewOpcode(CI);
 
-  // Handle descending offsets
-  if (CI.Offset0 > CI.Offset1)
-    std::swap(SubRegIdx0, SubRegIdx1);
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 
   // Copy to the new source register.
-  const TargetRegisterClass *SuperRC =
-    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
   unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
 
   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
@@ -803,18 +1079,20 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
       .addImm(SubRegIdx1);
 
   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
-      .addReg(SrcReg, RegState::Kill);
+                 .addReg(SrcReg, RegState::Kill);
 
-  if (CI.InstClass == BUFFER_STORE_OFFEN)
-      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+  const unsigned Regs = getRegs(Opcode);
+
+  if (Regs & VADDR)
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
 
   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
       .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
-      .addImm(CI.GLC0)      // glc
-      .addImm(CI.SLC0)      // slc
-      .addImm(0)            // tfe
-      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+      .addImm(CI.GLC0)                          // glc
+      .addImm(CI.SLC0)                          // slc
+      .addImm(0)                                // tfe
+      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
   moveInstsAfter(MIB, CI.InstsToMove);
 
@@ -824,105 +1102,399 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
   return Next;
 }
 
+MachineOperand
+SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
+  APInt V(32, Val, true);
+  if (TII->isInlineConstant(V))
+    return MachineOperand::CreateImm(Val);
+
+  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  MachineInstr *Mov =
+  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+          TII->get(AMDGPU::S_MOV_B32), Reg)
+    .addImm(Val);
+  (void)Mov;
+  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
+  return MachineOperand::CreateReg(Reg, false);
+}
+
+// Compute base address using Addr and return the final register.
+unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
+                                           const MemAddress &Addr) {
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  DebugLoc DL = MI.getDebugLoc();
+
+  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
+          Addr.Base.LoSubReg) &&
+         "Expected 32-bit Base-Register-Low!!");
+
+  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
+          Addr.Base.HiSubReg) &&
+         "Expected 32-bit Base-Register-Hi!!");
+
+  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
+  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
+  MachineOperand OffsetHi =
+    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
+  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  unsigned DeadCarryReg =
+    MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *LoHalf =
+    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
+      .addReg(CarryReg, RegState::Define)
+      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
+    .add(OffsetLo);
+  (void)LoHalf;
+  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
+
+  MachineInstr *HiHalf =
+  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
+    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+    .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
+    .add(OffsetHi)
+    .addReg(CarryReg, RegState::Kill);
+  (void)HiHalf;
+  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
+
+  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  MachineInstr *FullBase =
+    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
+  (void)FullBase;
+  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
+
+  return FullDestReg;
+}
+
+// Update base and offset with the NewBase and NewOffset in MI.
+void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
+                                               unsigned NewBase,
+                                               int32_t NewOffset) {
+  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
+  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
+}
+
+Optional<int32_t>
+SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
+  if (Op.isImm())
+    return Op.getImm();
+
+  if (!Op.isReg())
+    return None;
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
+      !Def->getOperand(1).isImm())
+    return None;
+
+  return Def->getOperand(1).getImm();
+}
+
+// Analyze Base and extracts:
+//  - 32bit base registers, subregisters
+//  - 64bit constant offset
+// Expecting base computation as:
+//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
+//   %LO:vgpr_32, %c:sreg_64_xexec =
+//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
+//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
+//   %Base:vreg_64 =
+//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
+void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
+                                                      MemAddress &Addr) {
+  if (!Base.isReg())
+    return;
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
+  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
+      || Def->getNumOperands() != 5)
+    return;
+
+  MachineOperand BaseLo = Def->getOperand(1);
+  MachineOperand BaseHi = Def->getOperand(3);
+  if (!BaseLo.isReg() || !BaseHi.isReg())
+    return;
+
+  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
+  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
+
+  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
+      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
+    return;
+
+  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
+  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
+
+  auto Offset0P = extractConstOffset(*Src0);
+  if (Offset0P)
+    BaseLo = *Src1;
+  else {
+    if (!(Offset0P = extractConstOffset(*Src1)))
+      return;
+    BaseLo = *Src0;
+  }
+
+  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
+  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
+
+  if (Src0->isImm())
+    std::swap(Src0, Src1);
+
+  if (!Src1->isImm())
+    return;
+
+  uint64_t Offset1 = Src1->getImm();
+  BaseHi = *Src0;
+
+  Addr.Base.LoReg = BaseLo.getReg();
+  Addr.Base.HiReg = BaseHi.getReg();
+  Addr.Base.LoSubReg = BaseLo.getSubReg();
+  Addr.Base.HiSubReg = BaseHi.getSubReg();
+  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
+}
+
+bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
+    MachineInstr &MI,
+    MemInfoMap &Visited,
+    SmallPtrSet<MachineInstr *, 4> &AnchorList) {
+
+  // TODO: Support flat and scratch.
+  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
+      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
+    return false;
+
+  // TODO: Support Store.
+  if (!MI.mayLoad())
+    return false;
+
+  if (AnchorList.count(&MI))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
+
+  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
+    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
+    return false;
+  }
+
+  // Step1: Find the base-registers and a 64bit constant offset.
+  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  MemAddress MAddr;
+  if (Visited.find(&MI) == Visited.end()) {
+    processBaseWithConstOffset(Base, MAddr);
+    Visited[&MI] = MAddr;
+  } else
+    MAddr = Visited[&MI];
+
+  if (MAddr.Offset == 0) {
+    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
+                         " constant offsets that can be promoted.\n";);
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
+             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
+
+  // Step2: Traverse through MI's basic block and find an anchor(that has the
+  // same base-registers) with the highest 13bit distance from MI's offset.
+  // E.g. (64bit loads)
+  // bb:
+  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
+  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
+  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
+  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
+  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
+  //
+  // Starting from the first load, the optimization will try to find a new base
+  // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
+  // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
+  // as the new-base(anchor) because of the maximum distance which can
+  // accomodate more intermediate bases presumeably.
+  //
+  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
+  // (&a + 8192) for load1, load2, load4.
+  //   addr = &a + 8192
+  //   load1 = load(addr,       -4096)
+  //   load2 = load(addr,       -2048)
+  //   load3 = load(addr,       0)
+  //   load4 = load(addr,       2048)
+  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
+  //
+  MachineInstr *AnchorInst = nullptr;
+  MemAddress AnchorAddr;
+  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
+  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
+
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator E = MBB->end();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  ++MBBI;
+  const SITargetLowering *TLI =
+    static_cast<const SITargetLowering *>(STM->getTargetLowering());
+
+  for ( ; MBBI != E; ++MBBI) {
+    MachineInstr &MINext = *MBBI;
+    // TODO: Support finding an anchor(with same base) from store addresses or
+    // any other load addresses where the opcodes are different.
+    if (MINext.getOpcode() != MI.getOpcode() ||
+        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
+      continue;
+
+    const MachineOperand &BaseNext =
+      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
+    MemAddress MAddrNext;
+    if (Visited.find(&MINext) == Visited.end()) {
+      processBaseWithConstOffset(BaseNext, MAddrNext);
+      Visited[&MINext] = MAddrNext;
+    } else
+      MAddrNext = Visited[&MINext];
+
+    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
+        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
+        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
+        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
+      continue;
+
+    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
+
+    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
+    TargetLoweringBase::AddrMode AM;
+    AM.HasBaseReg = true;
+    AM.BaseOffs = Dist;
+    if (TLI->isLegalGlobalAddressingMode(AM) &&
+        (uint32_t)std::abs(Dist) > MaxDist) {
+      MaxDist = std::abs(Dist);
+
+      AnchorAddr = MAddrNext;
+      AnchorInst = &MINext;
+    }
+  }
+
+  if (AnchorInst) {
+    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
+               AnchorInst->dump());
+    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
+               <<  AnchorAddr.Offset << "\n\n");
+
+    // Instead of moving up, just re-compute anchor-instruction's base address.
+    unsigned Base = computeBase(MI, AnchorAddr);
+
+    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
+    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
+
+    for (auto P : InstsWCommonBase) {
+      TargetLoweringBase::AddrMode AM;
+      AM.HasBaseReg = true;
+      AM.BaseOffs = P.second - AnchorAddr.Offset;
+
+      if (TLI->isLegalGlobalAddressingMode(AM)) {
+        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
+                   dbgs() << ")"; P.first->dump());
+        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
+        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
+      }
+    }
+    AnchorList.insert(AnchorInst);
+    return true;
+  }
+
+  return false;
+}
+
 // Scan through looking for adjacent LDS operations with constant offsets from
 // the same base register. We rely on the scheduler to do the hard work of
 // clustering nearby loads, and assume these are all adjacent.
 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
   bool Modified = false;
 
+  // Contain the list
+  MemInfoMap Visited;
+  // Contains the list of instructions for which constant offsets are being
+  // promoted to the IMM.
+  SmallPtrSet<MachineInstr *, 4> AnchorList;
+
   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
     MachineInstr &MI = *I;
 
+    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
+      Modified = true;
+
     // Don't combine if volatile.
     if (MI.hasOrderedMemoryRef()) {
       ++I;
       continue;
     }
 
+    const unsigned Opc = MI.getOpcode();
+
     CombineInfo CI;
     CI.I = I;
-    unsigned Opc = MI.getOpcode();
-    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
-        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
+    CI.InstClass = getInstClass(Opc);
 
-      CI.InstClass = DS_READ_WRITE;
+    switch (CI.InstClass) {
+    default:
+      break;
+    case DS_READ:
       CI.EltSize =
-        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
-
+          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
+                                                                          : 4;
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeRead2Pair(CI);
       } else {
         ++I;
       }
-
       continue;
-    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
-               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
-               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
-      CI.InstClass = DS_READ_WRITE;
-      CI.EltSize
-        = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
-
+    case DS_WRITE:
+      CI.EltSize =
+          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
+                                                                            : 4;
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeWrite2Pair(CI);
       } else {
         ++I;
       }
-
       continue;
-    }
-    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
-        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
-      // EltSize is in units of the offset encoding.
-      CI.InstClass = S_BUFFER_LOAD_IMM;
+    case S_BUFFER_LOAD_IMM:
       CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
-      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeSBufferLoadImmPair(CI);
-        if (!CI.IsX2)
-          CreatedX2++;
+        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
       } else {
         ++I;
       }
       continue;
-    }
-    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
-        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
-        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
-        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
-      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
-          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
-        CI.InstClass = BUFFER_LOAD_OFFEN;
-      else
-        CI.InstClass = BUFFER_LOAD_OFFSET;
-
+    case BUFFER_LOAD_OFFEN:
+    case BUFFER_LOAD_OFFSET:
+    case BUFFER_LOAD_OFFEN_exact:
+    case BUFFER_LOAD_OFFSET_exact:
       CI.EltSize = 4;
-      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
-                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeBufferLoadPair(CI);
-        if (!CI.IsX2)
-          CreatedX2++;
+        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
       } else {
         ++I;
       }
       continue;
-    }
-
-    bool StoreIsX2, IsOffen;
-    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
-      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
+    case BUFFER_STORE_OFFEN:
+    case BUFFER_STORE_OFFSET:
+    case BUFFER_STORE_OFFEN_exact:
+    case BUFFER_STORE_OFFSET_exact:
       CI.EltSize = 4;
-      CI.IsX2 = StoreIsX2;
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeBufferStorePair(CI);
-        if (!CI.IsX2)
-          CreatedX2++;
+        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
       } else {
         ++I;
       }
@@ -956,12 +1528,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   bool Modified = false;
 
   for (MachineBasicBlock &MBB : MF) {
-    CreatedX2 = 0;
-    Modified |= optimizeBlock(MBB);
-
-    // Run again to convert x2 to x4.
-    if (CreatedX2 >= 1)
+    do {
+      OptimizeAgain = false;
       Modified |= optimizeBlock(MBB);
+    } while (OptimizeAgain);
   }
 
   return Modified;
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index ad30317c344c3..1aa1feebbdae6 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -85,9 +85,7 @@ private:
 
   void emitIf(MachineInstr &MI);
   void emitElse(MachineInstr &MI);
-  void emitBreak(MachineInstr &MI);
   void emitIfBreak(MachineInstr &MI);
-  void emitElseBreak(MachineInstr &MI);
   void emitLoop(MachineInstr &MI);
   void emitEndCf(MachineInstr &MI);
 
@@ -329,20 +327,6 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
 }
 
-void SILowerControlFlow::emitBreak(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  const DebugLoc &DL = MI.getDebugLoc();
-  unsigned Dst = MI.getOperand(0).getReg();
-
-  MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-                         .addReg(AMDGPU::EXEC)
-                         .add(MI.getOperand(1));
-
-  if (LIS)
-    LIS->ReplaceMachineInstrInMaps(MI, *Or);
-  MI.eraseFromParent();
-}
-
 void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -384,11 +368,6 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
-  // Lowered in the same way as emitIfBreak above.
-  emitIfBreak(MI);
-}
-
 void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -515,18 +494,10 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
         emitElse(MI);
         break;
 
-      case AMDGPU::SI_BREAK:
-        emitBreak(MI);
-        break;
-
       case AMDGPU::SI_IF_BREAK:
         emitIfBreak(MI);
         break;
 
-      case AMDGPU::SI_ELSE_BREAK:
-        emitElseBreak(MI);
-        break;
-
       case AMDGPU::SI_LOOP:
         emitLoop(MI);
         break;
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index ecc6cff407e18..eb038bb5d5fcf 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -5,37 +5,61 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-/// i1 values are usually inserted by the CFG Structurize pass and they are
-/// unique in that they can be copied from VALU to SALU registers.
-/// This is not possible for any other value type.  Since there are no
-/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
-///
 //===----------------------------------------------------------------------===//
 //
+// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
+// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
+// and a wave-level control flow graph.
+//
+// Before this pass, values that are semantically i1 and are defined and used
+// within the same basic block are already represented as lane masks in scalar
+// registers. However, values that cross basic blocks are always transferred
+// between basic blocks in vreg_1 virtual registers and are lowered by this
+// pass.
+//
+// The only instructions that use or define vreg_1 virtual registers are COPY,
+// PHI, and IMPLICIT_DEF.
+//
+//===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "si-i1-copies"
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPULaneDominator.h"
-#include "llvm/CodeGen/LiveIntervals.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
 
+#define DEBUG_TYPE "si-i1-copies"
+
 using namespace llvm;
 
+static unsigned createLaneMaskReg(MachineFunction &MF);
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);
+
 namespace {
 
 class SILowerI1Copies : public MachineFunctionPass {
 public:
   static char ID;
 
+private:
+  MachineFunction *MF = nullptr;
+  MachineDominatorTree *DT = nullptr;
+  MachinePostDominatorTree *PDT = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  const GCNSubtarget *ST = nullptr;
+  const SIInstrInfo *TII = nullptr;
+
+  DenseSet<unsigned> ConstrainRegs;
+
 public:
   SILowerI1Copies() : MachineFunctionPass(ID) {
     initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
@@ -47,14 +71,337 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
+
+private:
+  void lowerCopiesFromI1();
+  void lowerPhis();
+  void lowerCopiesToI1();
+  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
+  void buildMergeLaneMasks(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, const DebugLoc &DL,
+                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
+  MachineBasicBlock::iterator
+  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+
+  bool isLaneMaskReg(unsigned Reg) const {
+    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
+           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
+               ST->getWavefrontSize();
+  }
+};
+
+/// Helper class that determines the relationship between incoming values of a
+/// phi in the control flow graph to determine where an incoming value can
+/// simply be taken as a scalar lane mask as-is, and where it needs to be
+/// merged with another, previously defined lane mask.
+///
+/// The approach is as follows:
+///  - Determine all basic blocks which, starting from the incoming blocks,
+///    a wave may reach before entering the def block (the block containing the
+///    phi).
+///  - If an incoming block has no predecessors in this set, we can take the
+///    incoming value as a scalar lane mask as-is.
+///  -- A special case of this is when the def block has a self-loop.
+///  - Otherwise, the incoming value needs to be merged with a previously
+///    defined lane mask.
+///  - If there is a path into the set of reachable blocks that does _not_ go
+///    through an incoming block where we can take the scalar lane mask as-is,
+///    we need to invent an available value for the SSAUpdater. Choices are
+///    0 and undef, with differing consequences for how to merge values etc.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+///       the traversal.
+///
+class PhiIncomingAnalysis {
+  MachinePostDominatorTree &PDT;
+
+  // For each reachable basic block, whether it is a source in the induced
+  // subgraph of the CFG.
+  DenseMap<MachineBasicBlock *, bool> ReachableMap;
+  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
+  SmallVector<MachineBasicBlock *, 4> Stack;
+  SmallVector<MachineBasicBlock *, 4> Predecessors;
+
+public:
+  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}
+
+  /// Returns whether \p MBB is a source in the induced subgraph of reachable
+  /// blocks.
+  bool isSource(MachineBasicBlock &MBB) const {
+    return ReachableMap.find(&MBB)->second;
+  }
+
+  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
+
+  void analyze(MachineBasicBlock &DefBlock,
+               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
+    assert(Stack.empty());
+    ReachableMap.clear();
+    ReachableOrdered.clear();
+    Predecessors.clear();
+
+    // Insert the def block first, so that it acts as an end point for the
+    // traversal.
+    ReachableMap.try_emplace(&DefBlock, false);
+    ReachableOrdered.push_back(&DefBlock);
+
+    for (MachineBasicBlock *MBB : IncomingBlocks) {
+      if (MBB == &DefBlock) {
+        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
+        continue;
+      }
+
+      ReachableMap.try_emplace(MBB, false);
+      ReachableOrdered.push_back(MBB);
+
+      // If this block has a divergent terminator and the def block is its
+      // post-dominator, the wave may first visit the other successors.
+      bool Divergent = false;
+      for (MachineInstr &MI : MBB->terminators()) {
+        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
+            MI.getOpcode() == AMDGPU::SI_IF ||
+            MI.getOpcode() == AMDGPU::SI_ELSE ||
+            MI.getOpcode() == AMDGPU::SI_LOOP) {
+          Divergent = true;
+          break;
+        }
+      }
+
+      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
+        for (MachineBasicBlock *Succ : MBB->successors())
+          Stack.push_back(Succ);
+      }
+    }
+
+    while (!Stack.empty()) {
+      MachineBasicBlock *MBB = Stack.pop_back_val();
+      if (!ReachableMap.try_emplace(MBB, false).second)
+        continue;
+      ReachableOrdered.push_back(MBB);
+
+      for (MachineBasicBlock *Succ : MBB->successors())
+        Stack.push_back(Succ);
+    }
+
+    for (MachineBasicBlock *MBB : ReachableOrdered) {
+      bool HaveReachablePred = false;
+      for (MachineBasicBlock *Pred : MBB->predecessors()) {
+        if (ReachableMap.count(Pred)) {
+          HaveReachablePred = true;
+        } else {
+          Stack.push_back(Pred);
+        }
+      }
+      if (!HaveReachablePred)
+        ReachableMap[MBB] = true;
+      if (HaveReachablePred) {
+        for (MachineBasicBlock *UnreachablePred : Stack) {
+          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
+            Predecessors.push_back(UnreachablePred);
+        }
+      }
+      Stack.clear();
+    }
+  }
+};
+
+/// Helper class that detects loops which require us to lower an i1 COPY into
+/// bitwise manipulation.
+///
+/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
+/// between loops with the same header. Consider this example:
+///
+///  A-+-+
+///  | | |
+///  B-+ |
+///  |   |
+///  C---+
+///
+/// A is the header of a loop containing A, B, and C as far as LoopInfo is
+/// concerned. However, an i1 COPY in B that is used in C must be lowered to
+/// bitwise operations to combine results from different loop iterations when
+/// B has a divergent branch (since by default we will compile this code such
+/// that threads in a wave are merged at the entry of C).
+///
+/// The following rule is implemented to determine whether bitwise operations
+/// are required: use the bitwise lowering for a def in block B if a backward
+/// edge to B is reachable without going through the nearest common
+/// post-dominator of B and all uses of the def.
+///
+/// TODO: This rule is conservative because it does not check whether the
+///       relevant branches are actually divergent.
+///
+/// The class is designed to cache the CFG traversal so that it can be re-used
+/// for multiple defs within the same basic block.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+///       the traversal.
+///
+class LoopFinder {
+  MachineDominatorTree &DT;
+  MachinePostDominatorTree &PDT;
+
+  // All visited / reachable block, tagged by level (level 0 is the def block,
+  // level 1 are all blocks reachable including but not going through the def
+  // block's IPDOM, etc.).
+  DenseMap<MachineBasicBlock *, unsigned> Visited;
+
+  // Nearest common dominator of all visited blocks by level (level 0 is the
+  // def block). Used for seeding the SSAUpdater.
+  SmallVector<MachineBasicBlock *, 4> CommonDominators;
+
+  // Post-dominator of all visited blocks.
+  MachineBasicBlock *VisitedPostDom = nullptr;
+
+  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
+  // reachable without going through the IPDOM of the def block (if the IPDOM
+  // itself has an edge to the def block, the loop level is 2), etc.
+  unsigned FoundLoopLevel = ~0u;
+
+  MachineBasicBlock *DefBlock = nullptr;
+  SmallVector<MachineBasicBlock *, 4> Stack;
+  SmallVector<MachineBasicBlock *, 4> NextLevel;
+
+public:
+  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
+      : DT(DT), PDT(PDT) {}
+
+  void initialize(MachineBasicBlock &MBB) {
+    Visited.clear();
+    CommonDominators.clear();
+    Stack.clear();
+    NextLevel.clear();
+    VisitedPostDom = nullptr;
+    FoundLoopLevel = ~0u;
+
+    DefBlock = &MBB;
+  }
+
+  /// Check whether a backward edge can be reached without going through the
+  /// given \p PostDom of the def block.
+  ///
+  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
+  unsigned findLoop(MachineBasicBlock *PostDom) {
+    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);
+
+    if (!VisitedPostDom)
+      advanceLevel();
+
+    unsigned Level = 0;
+    while (PDNode->getBlock() != PostDom) {
+      if (PDNode->getBlock() == VisitedPostDom)
+        advanceLevel();
+      PDNode = PDNode->getIDom();
+      Level++;
+      if (FoundLoopLevel == Level)
+        return Level;
+    }
+
+    return 0;
+  }
+
+  /// Add undef values dominating the loop and the optionally given additional
+  /// blocks, so that the SSA updater doesn't have to search all the way to the
+  /// function entry.
+  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
+                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
+    assert(LoopLevel < CommonDominators.size());
+
+    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
+    for (MachineBasicBlock *MBB : Blocks)
+      Dom = DT.findNearestCommonDominator(Dom, MBB);
+
+    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
+      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
+    } else {
+      // The dominator is part of the loop or the given blocks, so add the
+      // undef value to unreachable predecessors instead.
+      for (MachineBasicBlock *Pred : Dom->predecessors()) {
+        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
+          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
+      }
+    }
+  }
+
+private:
+  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
+                   ArrayRef<MachineBasicBlock *> Blocks) const {
+    auto DomIt = Visited.find(&MBB);
+    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
+      return true;
+
+    if (llvm::find(Blocks, &MBB) != Blocks.end())
+      return true;
+
+    return false;
+  }
+
+  void advanceLevel() {
+    MachineBasicBlock *VisitedDom;
+
+    if (!VisitedPostDom) {
+      VisitedPostDom = DefBlock;
+      VisitedDom = DefBlock;
+      Stack.push_back(DefBlock);
+    } else {
+      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
+      VisitedDom = CommonDominators.back();
+
+      for (unsigned i = 0; i < NextLevel.size();) {
+        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
+          Stack.push_back(NextLevel[i]);
+
+          NextLevel[i] = NextLevel.back();
+          NextLevel.pop_back();
+        } else {
+          i++;
+        }
+      }
+    }
+
+    unsigned Level = CommonDominators.size();
+    while (!Stack.empty()) {
+      MachineBasicBlock *MBB = Stack.pop_back_val();
+      if (!PDT.dominates(VisitedPostDom, MBB))
+        NextLevel.push_back(MBB);
+
+      Visited[MBB] = Level;
+      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);
+
+      for (MachineBasicBlock *Succ : MBB->successors()) {
+        if (Succ == DefBlock) {
+          if (MBB == VisitedPostDom)
+            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
+          else
+            FoundLoopLevel = std::min(FoundLoopLevel, Level);
+          continue;
+        }
+
+        if (Visited.try_emplace(Succ, ~0u).second) {
+          if (MBB == VisitedPostDom)
+            NextLevel.push_back(Succ);
+          else
+            Stack.push_back(Succ);
+        }
+      }
+    }
+
+    CommonDominators.push_back(VisitedDom);
+  }
 };
 
 } // End anonymous namespace.
 
-INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,
-                "SI Lower i1 Copies", false, false)
+INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+                    false)
 
 char SILowerI1Copies::ID = 0;
 
@@ -64,104 +411,415 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
   return new SILowerI1Copies();
 }
 
-bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+static unsigned createLaneMaskReg(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+}
+
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
+  MachineFunction &MF = *MBB.getParent();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+  unsigned UndefReg = createLaneMaskReg(MF);
+  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
+          UndefReg);
+  return UndefReg;
+}
 
-  std::vector<unsigned> I1Defs;
+/// Lower all instructions that def or use vreg_1 registers.
+///
+/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
+/// occur around inline assembly. We do this first, before vreg_1 registers
+/// are changed to scalar mask registers.
+///
+/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
+/// all others, because phi lowering looks through copies and can therefore
+/// often make copy lowering unnecessary.
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
+  MF = &TheMF;
+  MRI = &MF->getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-                                                  BI != BE; ++BI) {
+  ST = &MF->getSubtarget<GCNSubtarget>();
+  TII = ST->getInstrInfo();
 
-    MachineBasicBlock &MBB = *BI;
-    MachineBasicBlock::iterator I, Next;
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
-      Next = std::next(I);
-      MachineInstr &MI = *I;
+  lowerCopiesFromI1();
+  lowerPhis();
+  lowerCopiesToI1();
 
-      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
-        unsigned Reg = MI.getOperand(0).getReg();
-        const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-        if (RC == &AMDGPU::VReg_1RegClass)
-          MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
-        continue;
-      }
+  for (unsigned Reg : ConstrainRegs)
+    MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
+  ConstrainRegs.clear();
 
+  return true;
+}
+
+void SILowerI1Copies::lowerCopiesFromI1() {
+  SmallVector<MachineInstr *, 4> DeadCopies;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
       if (MI.getOpcode() != AMDGPU::COPY)
         continue;
 
-      const MachineOperand &Dst = MI.getOperand(0);
-      const MachineOperand &Src = MI.getOperand(1);
-
-      if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
-          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+          MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
         continue;
 
-      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
-      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
+      if (isLaneMaskReg(DstReg) ||
+          (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+           MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
+        continue;
 
+      // Copy into a 32-bit vector register.
+      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
       DebugLoc DL = MI.getDebugLoc();
-      MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
-      if (DstRC == &AMDGPU::VReg_1RegClass &&
-          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
-        I1Defs.push_back(Dst.getReg());
 
-        if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
-          if (DefInst->getOperand(1).isImm()) {
-            I1Defs.push_back(Dst.getReg());
+      assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
+      assert(!MI.getOperand(0).getSubReg());
 
-            int64_t Val = DefInst->getOperand(1).getImm();
-            assert(Val == 0 || Val == -1);
+      ConstrainRegs.insert(SrcReg);
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+          .addImm(0)
+          .addImm(-1)
+          .addReg(SrcReg);
+      DeadCopies.push_back(&MI);
+    }
 
-            BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
-                .add(Dst)
-                .addImm(Val);
-            MI.eraseFromParent();
-            continue;
+    for (MachineInstr *MI : DeadCopies)
+      MI->eraseFromParent();
+    DeadCopies.clear();
+  }
+}
+
+void SILowerI1Copies::lowerPhis() {
+  MachineSSAUpdater SSAUpdater(*MF);
+  LoopFinder LF(*DT, *PDT);
+  PhiIncomingAnalysis PIA(*PDT);
+  SmallVector<MachineInstr *, 4> DeadPhis;
+  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
+  SmallVector<unsigned, 4> IncomingRegs;
+  SmallVector<unsigned, 4> IncomingUpdated;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    LF.initialize(MBB);
+
+    for (MachineInstr &MI : MBB.phis()) {
+      unsigned DstReg = MI.getOperand(0).getReg();
+      if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
+
+      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+
+      // Collect incoming values.
+      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+        assert(i + 1 < MI.getNumOperands());
+        unsigned IncomingReg = MI.getOperand(i).getReg();
+        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
+        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
+
+        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+          IncomingReg = IncomingDef->getOperand(1).getReg();
+          assert(isLaneMaskReg(IncomingReg));
+          assert(!IncomingDef->getOperand(1).getSubReg());
+        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+          continue;
+        } else {
+          assert(IncomingDef->isPHI());
+        }
+
+        IncomingBlocks.push_back(IncomingMBB);
+        IncomingRegs.push_back(IncomingReg);
+      }
+
+      // Phis in a loop that are observed outside the loop receive a simple but
+      // conservatively correct treatment.
+      MachineBasicBlock *PostDomBound = &MBB;
+      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+        PostDomBound =
+            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+      }
+
+      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+
+      SSAUpdater.Initialize(DstReg);
+
+      if (FoundLoopLevel) {
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          IncomingUpdated.push_back(createLaneMaskReg(*MF));
+          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
+                                       IncomingUpdated.back());
+        }
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          buildMergeLaneMasks(
+              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+        }
+      } else {
+        // The phi is not observed from outside a loop. Use a more accurate
+        // lowering.
+        PIA.analyze(MBB, IncomingBlocks);
+
+        for (MachineBasicBlock *MBB : PIA.predecessors())
+          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          if (PIA.isSource(IMBB)) {
+            IncomingUpdated.push_back(0);
+            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+          } else {
+            IncomingUpdated.push_back(createLaneMaskReg(*MF));
+            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
           }
         }
 
-        unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc)
-            .add(Src);
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
-            .add(Dst)
-            .addImm(0)
-            .addImm(-1)
-            .addReg(TmpSrc);
-        MI.eraseFromParent();
-      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
-                 SrcRC == &AMDGPU::VReg_1RegClass) {
-        if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
-            DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
-            DefInst->getOperand(1).getImm() == 0 &&
-            DefInst->getOperand(2).getImm() != 0 &&
-            DefInst->getOperand(3).isReg() &&
-            TargetRegisterInfo::isVirtualRegister(
-              DefInst->getOperand(3).getReg()) &&
-            TRI->getCommonSubClass(
-              MRI.getRegClass(DefInst->getOperand(3).getReg()),
-              &AMDGPU::SGPR_64RegClass) &&
-            AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
-          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
-              .add(Dst)
-              .addReg(AMDGPU::EXEC)
-              .add(DefInst->getOperand(3));
-        } else {
-          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
-              .add(Dst)
-              .add(Src)
-              .addImm(0);
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          if (!IncomingUpdated[i])
+            continue;
+
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          buildMergeLaneMasks(
+              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
         }
-        MI.eraseFromParent();
       }
+
+      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+      if (NewReg != DstReg) {
+        MRI->replaceRegWith(NewReg, DstReg);
+
+        // Ensure that DstReg has a single def and mark the old PHI node for
+        // deletion.
+        MI.getOperand(0).setReg(NewReg);
+        DeadPhis.push_back(&MI);
+      }
+
+      IncomingBlocks.clear();
+      IncomingRegs.clear();
+      IncomingUpdated.clear();
     }
+
+    for (MachineInstr *MI : DeadPhis)
+      MI->eraseFromParent();
+    DeadPhis.clear();
   }
+}
+
+void SILowerI1Copies::lowerCopiesToI1() {
+  MachineSSAUpdater SSAUpdater(*MF);
+  LoopFinder LF(*DT, *PDT);
+  SmallVector<MachineInstr *, 4> DeadCopies;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    LF.initialize(MBB);
 
-  for (unsigned Reg : I1Defs)
-    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
+          MI.getOpcode() != AMDGPU::COPY)
+        continue;
+
+      unsigned DstReg = MI.getOperand(0).getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
+          MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+        continue;
+
+      if (MRI->use_empty(DstReg)) {
+        DeadCopies.push_back(&MI);
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
+
+      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+        continue;
+
+      DebugLoc DL = MI.getDebugLoc();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      assert(!MI.getOperand(1).getSubReg());
+
+      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+          !isLaneMaskReg(SrcReg)) {
+        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
+        unsigned TmpReg = createLaneMaskReg(*MF);
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
+            .addReg(SrcReg)
+            .addImm(0);
+        MI.getOperand(1).setReg(TmpReg);
+        SrcReg = TmpReg;
+      }
+
+      // Defs in a loop that are observed outside the loop must be transformed
+      // into appropriate bit manipulation.
+      MachineBasicBlock *PostDomBound = &MBB;
+      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+        PostDomBound =
+            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+      }
+
+      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+      if (FoundLoopLevel) {
+        SSAUpdater.Initialize(DstReg);
+        SSAUpdater.AddAvailableValue(&MBB, DstReg);
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);
+
+        buildMergeLaneMasks(MBB, MI, DL, DstReg,
+                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
+        DeadCopies.push_back(&MI);
+      }
+    }
+
+    for (MachineInstr *MI : DeadCopies)
+      MI->eraseFromParent();
+    DeadCopies.clear();
+  }
+}
+
+bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
+  const MachineInstr *MI;
+  for (;;) {
+    MI = MRI->getUniqueVRegDef(Reg);
+    if (MI->getOpcode() != AMDGPU::COPY)
+      break;
+
+    Reg = MI->getOperand(1).getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      return false;
+    if (!isLaneMaskReg(Reg))
+      return false;
+  }
+
+  if (MI->getOpcode() != AMDGPU::S_MOV_B64)
+    return false;
+
+  if (!MI->getOperand(1).isImm())
+    return false;
+
+  int64_t Imm = MI->getOperand(1).getImm();
+  if (Imm == 0) {
+    Val = false;
+    return true;
+  }
+  if (Imm == -1) {
+    Val = true;
+    return true;
+  }
 
   return false;
 }
+
+static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
+  Def = false;
+  Use = false;
+
+  for (const MachineOperand &MO : MI.operands()) {
+    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
+      if (MO.isUse())
+        Use = true;
+      else
+        Def = true;
+    }
+  }
+}
+
+/// Return a point at the end of the given \p MBB to insert SALU instructions
+/// for lane mask calculation. Take terminators and SCC into account.
+MachineBasicBlock::iterator
+SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
+  auto InsertionPt = MBB.getFirstTerminator();
+  bool TerminatorsUseSCC = false;
+  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
+    bool DefsSCC;
+    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
+    if (TerminatorsUseSCC || DefsSCC)
+      break;
+  }
+
+  if (!TerminatorsUseSCC)
+    return InsertionPt;
+
+  while (InsertionPt != MBB.begin()) {
+    InsertionPt--;
+
+    bool DefSCC, UseSCC;
+    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
+    if (DefSCC)
+      return InsertionPt;
+  }
+
+  // We should have at least seen an IMPLICIT_DEF or COPY
+  llvm_unreachable("SCC used by terminator but no def in block");
+}
+
+void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          const DebugLoc &DL, unsigned DstReg,
+                                          unsigned PrevReg, unsigned CurReg) {
+  bool PrevVal;
+  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
+  bool CurVal;
+  bool CurConstant = isConstantLaneMask(CurReg, CurVal);
+
+  if (PrevConstant && CurConstant) {
+    if (PrevVal == CurVal) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
+    } else if (CurVal) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
+    } else {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
+          .addReg(AMDGPU::EXEC)
+          .addImm(-1);
+    }
+    return;
+  }
+
+  unsigned PrevMaskedReg = 0;
+  unsigned CurMaskedReg = 0;
+  if (!PrevConstant) {
+    if (CurConstant && CurVal) {
+      PrevMaskedReg = PrevReg;
+    } else {
+      PrevMaskedReg = createLaneMaskReg(*MF);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
+          .addReg(PrevReg)
+          .addReg(AMDGPU::EXEC);
+    }
+  }
+  if (!CurConstant) {
+    // TODO: check whether CurReg is already masked by EXEC
+    if (PrevConstant && PrevVal) {
+      CurMaskedReg = CurReg;
+    } else {
+      CurMaskedReg = createLaneMaskReg(*MF);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
+          .addReg(CurReg)
+          .addReg(AMDGPU::EXEC);
+    }
+  }
+
+  if (PrevConstant && !PrevVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+        .addReg(CurMaskedReg);
+  } else if (CurConstant && !CurVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+        .addReg(PrevMaskedReg);
+  } else if (PrevConstant && PrevVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
+        .addReg(CurMaskedReg)
+        .addReg(AMDGPU::EXEC);
+  } else {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
+        .addReg(PrevMaskedReg)
+        .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
+  }
+}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 0d5ff75e37ed8..181cc41bd5ff7 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -117,7 +117,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   }
 
   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  bool MaySpill = ST.isVGPRSpillingEnabled(F);
   bool HasStackObjects = FrameInfo.hasStackObjects();
 
   if (isEntryFunction()) {
@@ -126,21 +125,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     if (WorkItemIDZ)
       WorkItemIDY = true;
 
-    if (HasStackObjects || MaySpill) {
-      PrivateSegmentWaveByteOffset = true;
+    PrivateSegmentWaveByteOffset = true;
 
     // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
     if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
         (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
-      ArgInfo.PrivateSegmentWaveByteOffset
-        = ArgDescriptor::createRegister(AMDGPU::SGPR5);
-    }
+      ArgInfo.PrivateSegmentWaveByteOffset =
+          ArgDescriptor::createRegister(AMDGPU::SGPR5);
   }
 
-  bool IsCOV2 = ST.isAmdCodeObjectV2(F);
-  if (IsCOV2) {
-    if (HasStackObjects || MaySpill)
-      PrivateSegmentBuffer = true;
+  bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
+  if (isAmdHsaOrMesa) {
+    PrivateSegmentBuffer = true;
 
     if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
       DispatchPtr = true;
@@ -151,14 +147,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     if (F.hasFnAttribute("amdgpu-dispatch-id"))
       DispatchID = true;
   } else if (ST.isMesaGfxShader(F)) {
-    if (HasStackObjects || MaySpill)
-      ImplicitBufferPtr = true;
+    ImplicitBufferPtr = true;
   }
 
   if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
     KernargSegmentPtr = true;
 
-  if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) {
+  if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
     // TODO: This could be refined a lot. The attribute is a poor way of
     // detecting calls that may require it before argument lowering.
     if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch"))
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 18754442898f7..fb7e670068fe6 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -471,7 +471,7 @@ void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
 #ifndef NDEBUG
   if (SuccSU->NumPredsLeft == 0) {
     dbgs() << "*** Scheduling failed! ***\n";
-    SuccSU->dump(DAG);
+    DAG->dumpNode(*SuccSU);
     dbgs() << " has been released too many times!\n";
     llvm_unreachable(nullptr);
   }
@@ -611,13 +611,11 @@ void SIScheduleBlock::printDebug(bool full) {
 
   dbgs() << "\nInstructions:\n";
   if (!Scheduled) {
-    for (SUnit* SU : SUnits) {
-      SU->dump(DAG);
-    }
+    for (const SUnit* SU : SUnits)
+      DAG->dumpNode(*SU);
   } else {
-    for (SUnit* SU : SUnits) {
-      SU->dump(DAG);
-    }
+    for (const SUnit* SU : SUnits)
+      DAG->dumpNode(*SU);
   }
 
   dbgs() << "///////////////////////\n";
@@ -1933,7 +1931,7 @@ void SIScheduleDAGMI::schedule()
   LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
 
   buildDAGWithRegPressure();
-  LLVM_DEBUG(for (SUnit &SU : SUnits) SU.dumpAll(this));
+  LLVM_DEBUG(dump());
 
   topologicalSort();
   findRootsAndBiasEdges(TopRoots, BotRoots);
@@ -1957,12 +1955,12 @@ void SIScheduleDAGMI::schedule()
 
   for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
     SUnit *SU = &SUnits[i];
-    unsigned BaseLatReg;
+    MachineOperand *BaseLatOp;
     int64_t OffLatReg;
     if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
       IsLowLatencySU[i] = 1;
-      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
-                                       TRI))
+      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
+                                         TRI))
         LowLatencyOffset[i] = OffLatReg;
     } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
       IsHighLatencySU[i] = 1;
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 938cdaf1ef8fb..b4a4e9e33133d 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -202,8 +202,6 @@ public:
 
 class SIMemOpAccess final {
 private:
-
-  AMDGPUAS SIAddrSpaceInfo;
   AMDGPUMachineModuleInfo *MMI = nullptr;
 
   /// Reports unsupported message \p Msg for \p MI to LLVM context.
@@ -255,7 +253,7 @@ protected:
   /// Instruction info.
   const SIInstrInfo *TII = nullptr;
 
-  IsaInfo::IsaVersion IV;
+  IsaVersion IV;
 
   SICacheControl(const GCNSubtarget &ST);
 
@@ -453,22 +451,21 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
 }
 
 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
-  if (AS == SIAddrSpaceInfo.FLAT_ADDRESS)
+  if (AS == AMDGPUAS::FLAT_ADDRESS)
     return SIAtomicAddrSpace::FLAT;
-  if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS)
+  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
     return SIAtomicAddrSpace::GLOBAL;
-  if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS)
+  if (AS == AMDGPUAS::LOCAL_ADDRESS)
     return SIAtomicAddrSpace::LDS;
-  if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS)
+  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return SIAtomicAddrSpace::SCRATCH;
-  if (AS == SIAddrSpaceInfo.REGION_ADDRESS)
+  if (AS == AMDGPUAS::REGION_ADDRESS)
     return SIAtomicAddrSpace::GDS;
 
   return SIAtomicAddrSpace::OTHER;
 }
 
 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
-  SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget());
   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
 }
 
@@ -608,7 +605,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
 
 SICacheControl::SICacheControl(const GCNSubtarget &ST) {
   TII = ST.getInstrInfo();
-  IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
+  IV = getIsaVersion(ST.getCPU());
 }
 
 /* static */
@@ -815,6 +812,12 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
+  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
+
+  const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
+                             ? AMDGPU::BUFFER_WBINVL1
+                             : AMDGPU::BUFFER_WBINVL1_VOL;
+
   if (Pos == Position::AFTER)
     ++MI;
 
@@ -822,7 +825,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
     switch (Scope) {
     case SIAtomicScope::SYSTEM:
     case SIAtomicScope::AGENT:
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL));
+      BuildMI(MBB, MI, DL, TII->get(Flush));
       Changed = true;
       break;
     case SIAtomicScope::WORKGROUP:
diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp
new file mode 100644
index 0000000000000..883fd308f2f4b
--- /dev/null
+++ b/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -0,0 +1,406 @@
+//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass inserts changes to the Mode register settings as required.
+/// Note that currently it only deals with the Double Precision Floating Point
+/// rounding mode setting, but is intended to be generic enough to be easily
+/// expanded.
+///
+//===----------------------------------------------------------------------===//
+//
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <queue>
+
+#define DEBUG_TYPE "si-mode-register"
+
+STATISTIC(NumSetregInserted, "Number of setreg of mode register inserted.");
+
+using namespace llvm;
+
+struct Status {
+  // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a
+  // known value
+  unsigned Mask;
+  unsigned Mode;
+
+  Status() : Mask(0), Mode(0){};
+
+  Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) {
+    Mode &= Mask;
+  };
+
+  // merge two status values such that only values that don't conflict are
+  // preserved
+  Status merge(const Status &S) const {
+    return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask)));
+  }
+
+  // merge an unknown value by using the unknown value's mask to remove bits
+  // from the result
+  Status mergeUnknown(unsigned newMask) {
+    return Status(Mask & ~newMask, Mode & ~newMask);
+  }
+
+  // intersect two Status values to produce a mode and mask that is a subset
+  // of both values
+  Status intersect(const Status &S) const {
+    unsigned NewMask = (Mask & S.Mask) & (Mode ^ ~S.Mode);
+    unsigned NewMode = (Mode & NewMask);
+    return Status(NewMask, NewMode);
+  }
+
+  // produce the delta required to change the Mode to the required Mode
+  Status delta(const Status &S) const {
+    return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode);
+  }
+
+  bool operator==(const Status &S) const {
+    return (Mask == S.Mask) && (Mode == S.Mode);
+  }
+
+  bool operator!=(const Status &S) const { return !(*this == S); }
+
+  bool isCompatible(Status &S) {
+    return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode);
+  }
+
+  bool isCombinable(Status &S) {
+    return !(Mask & S.Mask) || isCompatible(S);
+  }
+};
+
+class BlockData {
+public:
+  // The Status that represents the mode register settings required by the
+  // FirstInsertionPoint (if any) in this block. Calculated in Phase 1.
+  Status Require;
+
+  // The Status that represents the net changes to the Mode register made by
+  // this block, Calculated in Phase 1.
+  Status Change;
+
+  // The Status that represents the mode register settings on exit from this
+  // block. Calculated in Phase 2.
+  Status Exit;
+
+  // The Status that represents the intersection of exit Mode register settings
+  // from all predecessor blocks. Calculated in Phase 2, and used by Phase 3.
+  Status Pred;
+
+  // In Phase 1 we record the first instruction that has a mode requirement,
+  // which is used in Phase 3 if we need to insert a mode change.
+  MachineInstr *FirstInsertionPoint;
+
+  BlockData() : FirstInsertionPoint(nullptr) {};
+};
+
+namespace {
+
+class SIModeRegister : public MachineFunctionPass {
+public:
+  static char ID;
+
+  std::vector<std::unique_ptr<BlockData>> BlockInfo;
+  std::queue<MachineBasicBlock *> Phase2List;
+
+  // The default mode register setting currently only caters for the floating
+  // point double precision rounding mode.
+  // We currently assume the default rounding mode is Round to Nearest
+  // NOTE: this should come from a per function rounding mode setting once such
+  // a setting exists.
+  unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST;
+  Status DefaultStatus =
+      Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode));
+
+public:
+  SIModeRegister() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII);
+
+  void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I,
+                    const SIInstrInfo *TII, Status InstrMode);
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE,
+                "Insert required mode register values", false, false)
+
+char SIModeRegister::ID = 0;
+
+char &llvm::SIModeRegisterID = SIModeRegister::ID;
+
+FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); }
+
+// Determine the Mode register setting required for this instruction.
+// Instructions which don't use the Mode register return a null Status.
+// Note this currently only deals with instructions that use the floating point
+// double precision setting.
+Status SIModeRegister::getInstructionMode(MachineInstr &MI,
+                                          const SIInstrInfo *TII) {
+  if (TII->usesFPDPRounding(MI)) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_INTERP_P1LL_F16:
+    case AMDGPU::V_INTERP_P1LV_F16:
+    case AMDGPU::V_INTERP_P2_F16:
+      // f16 interpolation instructions need double precision round to zero
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+    default:
+      return DefaultStatus;
+    }
+  }
+  return Status();
+}
+
+// Insert a setreg instruction to update the Mode register.
+// It is possible (though unlikely) for an instruction to require a change to
+// the value of disjoint parts of the Mode register when we don't know the
+// value of the intervening bits. In that case we need to use more than one
+// setreg instruction.
+void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const SIInstrInfo *TII, Status InstrMode) {
+  while (InstrMode.Mask) {
+    unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask);
+    unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset);
+    unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
+    BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+        .addImm(Value)
+        .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
+                (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+                (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+    ++NumSetregInserted;
+    InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
+  }
+}
+
+// In Phase 1 we iterate through the instructions of the block and for each
+// instruction we get its mode usage. If the instruction uses the Mode register
+// we:
+// - update the Change status, which tracks the changes to the Mode register
+//   made by this block
+// - if this instruction's requirements are compatible with the current setting
+//   of the Mode register we merge the modes
+// - if it isn't compatible and an InsertionPoint isn't set, then we set the
+//   InsertionPoint to the current instruction, and we remember the current
+//   mode
+// - if it isn't compatible and InsertionPoint is set we insert a seteg before
+//   that instruction (unless this instruction forms part of the block's
+//   entry requirements in which case the insertion is deferred until Phase 3
+//   when predecessor exit values are known), and move the insertion point to
+//   this instruction
+// - if this is a setreg instruction we treat it as an incompatible instruction.
+//   This is sub-optimal but avoids some nasty corner cases, and is expected to
+//   occur very rarely.
+// - on exit we have set the Require, Change, and initial Exit modes.
+void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  auto NewInfo = llvm::make_unique<BlockData>();
+  MachineInstr *InsertionPoint = nullptr;
+  // RequirePending is used to indicate whether we are collecting the initial
+  // requirements for the block, and need to defer the first InsertionPoint to
+  // Phase 3. It is set to false once we have set FirstInsertionPoint, or when
+  // we discover an explict setreg that means this block doesn't have any
+  // initial requirements.
+  bool RequirePending = true;
+  Status IPChange;
+  for (MachineInstr &MI : MBB) {
+    Status InstrMode = getInstructionMode(MI, TII);
+    if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) ||
+        (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) {
+      // We preserve any explicit mode register setreg instruction we encounter,
+      // as we assume it has been inserted by a higher authority (this is
+      // likely to be a very rare occurrence).
+      unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
+      if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
+          AMDGPU::Hwreg::ID_MODE)
+        continue;
+
+      unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
+                        AMDGPU::Hwreg::WIDTH_M1_SHIFT_) +
+                       1;
+      unsigned Offset =
+          (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
+      unsigned Mask = ((1 << Width) - 1) << Offset;
+
+      // If an InsertionPoint is set we will insert a setreg there.
+      if (InsertionPoint) {
+        insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+        InsertionPoint = nullptr;
+      }
+      // If this is an immediate then we know the value being set, but if it is
+      // not an immediate then we treat the modified bits of the mode register
+      // as unknown.
+      if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) {
+        unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm();
+        unsigned Mode = (Val << Offset) & Mask;
+        Status Setreg = Status(Mask, Mode);
+        // If we haven't already set the initial requirements for the block we
+        // don't need to as the requirements start from this explicit setreg.
+        RequirePending = false;
+        NewInfo->Change = NewInfo->Change.merge(Setreg);
+      } else {
+        NewInfo->Change = NewInfo->Change.mergeUnknown(Mask);
+      }
+    } else if (!NewInfo->Change.isCompatible(InstrMode)) {
+      // This instruction uses the Mode register and its requirements aren't
+      // compatible with the current mode.
+      if (InsertionPoint) {
+        // If the required mode change cannot be included in the current
+        // InsertionPoint changes, we need a setreg and start a new
+        // InsertionPoint.
+        if (!IPChange.delta(NewInfo->Change).isCombinable(InstrMode)) {
+          if (RequirePending) {
+            // This is the first insertionPoint in the block so we will defer
+            // the insertion of the setreg to Phase 3 where we know whether or
+            // not it is actually needed.
+            NewInfo->FirstInsertionPoint = InsertionPoint;
+            NewInfo->Require = NewInfo->Change;
+            RequirePending = false;
+          } else {
+            insertSetreg(MBB, InsertionPoint, TII,
+                         IPChange.delta(NewInfo->Change));
+            IPChange = NewInfo->Change;
+          }
+          // Set the new InsertionPoint
+          InsertionPoint = &MI;
+        }
+        NewInfo->Change = NewInfo->Change.merge(InstrMode);
+      } else {
+        // No InsertionPoint is currently set - this is either the first in
+        // the block or we have previously seen an explicit setreg.
+        InsertionPoint = &MI;
+        IPChange = NewInfo->Change;
+        NewInfo->Change = NewInfo->Change.merge(InstrMode);
+      }
+    }
+  }
+  if (RequirePending) {
+    // If we haven't yet set the initial requirements for the block we set them
+    // now.
+    NewInfo->FirstInsertionPoint = InsertionPoint;
+    NewInfo->Require = NewInfo->Change;
+  } else if (InsertionPoint) {
+    // We need to insert a setreg at the InsertionPoint
+    insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+  }
+  NewInfo->Exit = NewInfo->Change;
+  BlockInfo[MBB.getNumber()] = std::move(NewInfo);
+}
+
+// In Phase 2 we revisit each block and calculate the common Mode register
+// value provided by all predecessor blocks. If the Exit value for the block
+// is changed, then we add the successor blocks to the worklist so that the
+// exit value is propagated.
+void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+//  BlockData *BI = BlockInfo[MBB.getNumber()];
+  unsigned ThisBlock = MBB.getNumber();
+  if (MBB.pred_empty()) {
+    // There are no predecessors, so use the default starting status.
+    BlockInfo[ThisBlock]->Pred = DefaultStatus;
+  } else {
+    // Build a status that is common to all the predecessors by intersecting
+    // all the predecessor exit status values.
+    MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end();
+    MachineBasicBlock &PB = *(*P);
+    BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit;
+
+    for (P = std::next(P); P != E; P = std::next(P)) {
+      MachineBasicBlock *Pred = *P;
+      BlockInfo[ThisBlock]->Pred = BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[Pred->getNumber()]->Exit);
+    }
+  }
+  Status TmpStatus = BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change);
+  if (BlockInfo[ThisBlock]->Exit != TmpStatus) {
+    BlockInfo[ThisBlock]->Exit = TmpStatus;
+    // Add the successors to the work list so we can propagate the changed exit
+    // status.
+    for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
+                                          E = MBB.succ_end();
+         S != E; S = std::next(S)) {
+      MachineBasicBlock &B = *(*S);
+      Phase2List.push(&B);
+    }
+  }
+}
+
+// In Phase 3 we revisit each block and if it has an insertion point defined we
+// check whether the predecessor mode meets the block's entry requirements. If
+// not we insert an appropriate setreg instruction to modify the Mode register.
+void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+//  BlockData *BI = BlockInfo[MBB.getNumber()];
+  unsigned ThisBlock = MBB.getNumber();
+  if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) {
+    Status Delta = BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require);
+    if (BlockInfo[ThisBlock]->FirstInsertionPoint)
+      insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta);
+    else
+      insertSetreg(MBB, &MBB.instr_front(), TII, Delta);
+  }
+}
+
+bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
+  BlockInfo.resize(MF.getNumBlockIDs());
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Processing is performed in a number of phases
+
+  // Phase 1 - determine the initial mode required by each block, and add setreg
+  // instructions for intra block requirements.
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase1(BB, TII);
+
+  // Phase 2 - determine the exit mode from each block. We add all blocks to the
+  // list here, but will also add any that need to be revisited during Phase 2
+  // processing.
+  for (MachineBasicBlock &BB : MF)
+    Phase2List.push(&BB);
+  while (!Phase2List.empty()) {
+    processBlockPhase2(*Phase2List.front(), TII);
+    Phase2List.pop();
+  }
+
+  // Phase 3 - add an initial setreg to each block where the required entry mode
+  // is not satisfied by the exit mode of all its predecessors.
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase3(BB, TII);
+
+  BlockInfo.clear();
+
+  return NumSetregInserted > 0;
+}
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 7b678d12ba818..c671fed34bdf1 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -103,6 +103,122 @@ static MachineInstr* getOrExecSource(const MachineInstr &MI,
   return SaveExecInst;
 }
 
+// Optimize sequence
+//    %sel = V_CNDMASK_B32_e64 0, 1, %cc
+//    %cmp = V_CMP_NE_U32 1, %1
+//    $vcc = S_AND_B64 $exec, %cmp
+//    S_CBRANCH_VCC[N]Z
+// =>
+//    $vcc = S_ANDN2_B64 $exec, %cc
+//    S_CBRANCH_VCC[N]Z
+//
+// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
+// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
+// only 3 first instructions are really needed. S_AND_B64 with exec is a
+// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
+// lanes.
+//
+// Returns %cc register on success.
+static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
+                                     const GCNSubtarget &ST,
+                                     MachineRegisterInfo &MRI,
+                                     LiveIntervals *LIS) {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const unsigned AndOpc = AMDGPU::S_AND_B64;
+  const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64;
+  const unsigned CondReg = AMDGPU::VCC;
+  const unsigned ExecReg = AMDGPU::EXEC;
+
+  auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
+                           unsigned Opc = MI.getOpcode();
+                           return Opc == AMDGPU::S_CBRANCH_VCCZ ||
+                                  Opc == AMDGPU::S_CBRANCH_VCCNZ; });
+  if (I == MBB.terminators().end())
+    return AMDGPU::NoRegister;
+
+  auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister,
+                                   *I, MRI, LIS);
+  if (!And || And->getOpcode() != AndOpc ||
+      !And->getOperand(1).isReg() || !And->getOperand(2).isReg())
+    return AMDGPU::NoRegister;
+
+  MachineOperand *AndCC = &And->getOperand(1);
+  unsigned CmpReg = AndCC->getReg();
+  unsigned CmpSubReg = AndCC->getSubReg();
+  if (CmpReg == ExecReg) {
+    AndCC = &And->getOperand(2);
+    CmpReg = AndCC->getReg();
+    CmpSubReg = AndCC->getSubReg();
+  } else if (And->getOperand(2).getReg() != ExecReg) {
+    return AMDGPU::NoRegister;
+  }
+
+  auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, MRI, LIS);
+  if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
+                Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
+      Cmp->getParent() != And->getParent())
+    return AMDGPU::NoRegister;
+
+  MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
+  MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
+  if (Op1->isImm() && Op2->isReg())
+    std::swap(Op1, Op2);
+  if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
+    return AMDGPU::NoRegister;
+
+  unsigned SelReg = Op1->getReg();
+  auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS);
+  if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+    return AMDGPU::NoRegister;
+
+  Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
+  Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
+  MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
+  if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
+      Op1->getImm() != 0 || Op2->getImm() != 1)
+    return AMDGPU::NoRegister;
+
+  LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t'
+                    << *Cmp << '\t' << *And);
+
+  unsigned CCReg = CC->getReg();
+  LIS->RemoveMachineInstrFromMaps(*And);
+  MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
+                                TII->get(Andn2Opc), And->getOperand(0).getReg())
+                            .addReg(ExecReg)
+                            .addReg(CCReg, CC->getSubReg());
+  And->eraseFromParent();
+  LIS->InsertMachineInstrInMaps(*Andn2);
+
+  LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
+
+  // Try to remove compare. Cmp value should not used in between of cmp
+  // and s_and_b64 if VCC or just unused if any other register.
+  if ((TargetRegisterInfo::isVirtualRegister(CmpReg) &&
+       MRI.use_nodbg_empty(CmpReg)) ||
+      (CmpReg == CondReg &&
+       std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
+                    [&](const MachineInstr &MI) {
+                      return MI.readsRegister(CondReg, TRI); }))) {
+    LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
+
+    LIS->RemoveMachineInstrFromMaps(*Cmp);
+    Cmp->eraseFromParent();
+
+    // Try to remove v_cndmask_b32.
+    if (TargetRegisterInfo::isVirtualRegister(SelReg) &&
+        MRI.use_nodbg_empty(SelReg)) {
+      LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
+
+      LIS->RemoveMachineInstrFromMaps(*Sel);
+      Sel->eraseFromParent();
+    }
+  }
+
+  return CCReg;
+}
+
 bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -117,9 +233,24 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
 
   for (MachineBasicBlock &MBB : MF) {
 
+    if (unsigned Reg = optimizeVcndVcmpPair(MBB, ST, MRI, LIS)) {
+      RecalcRegs.insert(Reg);
+      RecalcRegs.insert(AMDGPU::VCC_LO);
+      RecalcRegs.insert(AMDGPU::VCC_HI);
+      RecalcRegs.insert(AMDGPU::SCC);
+      Changed = true;
+    }
+
     // Try to remove unneeded instructions before s_endpgm.
     if (MBB.succ_empty()) {
-      if (MBB.empty() || MBB.back().getOpcode() != AMDGPU::S_ENDPGM)
+      if (MBB.empty())
+        continue;
+
+      // Skip this if the endpgm has any implicit uses, otherwise we would need
+      // to be careful to update / remove them.
+      MachineInstr &Term = MBB.back();
+      if (Term.getOpcode() != AMDGPU::S_ENDPGM ||
+          Term.getNumOperands() != 0)
         continue;
 
       SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 0e000b72962eb..2d43d5d05ef64 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -90,7 +90,9 @@ public:
   bool runOnMachineFunction(MachineFunction &MF) override;
   void matchSDWAOperands(MachineBasicBlock &MBB);
   std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
-  bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
+  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
+  void pseudoOpConvertToVOP2(MachineInstr &MI,
+                             const GCNSubtarget &ST) const;
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
@@ -854,7 +856,82 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
   }
 }
 
-bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
+// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
+// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
+// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa.
+//
+// We are transforming from a VOP3 into a VOP2 form of the instruction.
+//   %19:vgpr_32 = V_AND_B32_e32 255,
+//       killed %16:vgpr_32, implicit $exec
+//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
+//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
+//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
+//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
+//
+// becomes
+//   %47:vgpr_32 = V_ADD_I32_sdwa
+//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
+//       implicit-def $vcc, implicit $exec
+//  %48:vgpr_32 = V_ADDC_U32_e32
+//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
+void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
+                                           const GCNSubtarget &ST) const {
+  int Opc = MI.getOpcode();
+  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
+         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
+
+  // Can the candidate MI be shrunk?
+  if (!TII->canShrink(MI, *MRI))
+    return;
+  Opc = AMDGPU::getVOPe32(Opc);
+  // Find the related ADD instruction.
+  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+  if (!Sdst)
+    return;
+  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
+  if (!NextOp)
+    return;
+  MachineInstr &MISucc = *NextOp->getParent();
+  // Can the successor be shrunk?
+  if (!TII->canShrink(MISucc, *MRI))
+    return;
+  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
+  // Make sure the carry in/out are subsequently unused.
+  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
+  if (!CarryIn)
+    return;
+  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
+  if (!CarryOut)
+    return;
+  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
+    return;
+  // Make sure VCC or its subregs are dead before MI.
+  MachineBasicBlock &MBB = *MI.getParent();
+  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
+  if (Liveness != MachineBasicBlock::LQR_Dead)
+    return;
+  // Check if VCC is referenced in range of (MI,MISucc].
+  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
+       I != E; ++I) {
+    if (I->modifiesRegister(AMDGPU::VCC, TRI))
+      return;
+  }
+  // Make the two new e32 instruction variants.
+  // Replace MI with V_{SUB|ADD}_I32_e32
+  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
+  MI.eraseFromParent();
+  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
+  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
+  MISucc.eraseFromParent();
+}
+
+bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                          const GCNSubtarget &ST) const {
   // Check if this is already an SDWA instruction
   unsigned Opc = MI.getOpcode();
@@ -1127,6 +1204,22 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF) {
     bool Changed = false;
     do {
+      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
+      // Look for a possible ADD or SUB that resulted from a previously lowered
+      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
+      // lowers the pair of instructions into e32 form.
+      matchSDWAOperands(MBB);
+      for (const auto &OperandPair : SDWAOperands) {
+        const auto &Operand = OperandPair.second;
+        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+        if (PotentialMI &&
+           (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
+            PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
+          pseudoOpConvertToVOP2(*PotentialMI, ST);
+      }
+      SDWAOperands.clear();
+
+      // Generate potential match list.
       matchSDWAOperands(MBB);
 
       for (const auto &OperandPair : SDWAOperands) {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 624607f6ea542..97cfde2b23541 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -18,9 +18,12 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 
@@ -495,15 +498,16 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
     return false;
 
   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
-  MachineInstrBuilder NewMI = BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
-    .add(*Reg)
-    .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
-    .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
-    .addImm(Offset)
-    .addImm(0) // glc
-    .addImm(0) // slc
-    .addImm(0) // tfe
-    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  MachineInstrBuilder NewMI =
+      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
+          .add(*Reg)
+          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
+          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
+          .addImm(Offset)
+          .addImm(0) // glc
+          .addImm(0) // slc
+          .addImm(0) // tfe
+          .cloneMemRefs(*MI);
 
   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                        AMDGPU::OpName::vdata_in);
@@ -900,7 +904,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
         .addImm(0)                        // glc
         .addMemOperand(MMO);
 
-      if (NumSubRegs > 1)
+      if (NumSubRegs > 1 && i == 0)
         MIB.addReg(SuperReg, RegState::ImplicitDefine);
 
       continue;
@@ -914,7 +918,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
         .addReg(Spill.VGPR)
         .addImm(Spill.Lane);
 
-      if (NumSubRegs > 1)
+      if (NumSubRegs > 1 && i == 0)
         MIB.addReg(SuperReg, RegState::ImplicitDefine);
     } else {
       if (OnlyToVGPR)
@@ -1598,3 +1602,57 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
     llvm_unreachable("not implemented");
   }
 }
+
+// Find reaching register definition
+MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
+                                              MachineInstr &Use,
+                                              MachineRegisterInfo &MRI,
+                                              LiveIntervals *LIS) const {
+  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
+  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
+  SlotIndex DefIdx;
+
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    if (!LIS->hasInterval(Reg))
+      return nullptr;
+    LiveInterval &LI = LIS->getInterval(Reg);
+    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
+                                  : MRI.getMaxLaneMaskForVReg(Reg);
+    VNInfo *V = nullptr;
+    if (LI.hasSubRanges()) {
+      for (auto &S : LI.subranges()) {
+        if ((S.LaneMask & SubLanes) == SubLanes) {
+          V = S.getVNInfoAt(UseIdx);
+          break;
+        }
+      }
+    } else {
+      V = LI.getVNInfoAt(UseIdx);
+    }
+    if (!V)
+      return nullptr;
+    DefIdx = V->def;
+  } else {
+    // Find last def.
+    for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
+      LiveRange &LR = LIS->getRegUnit(*Units);
+      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
+        if (!DefIdx.isValid() ||
+            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
+                          LIS->getInstructionFromIndex(V->def)))
+          DefIdx = V->def;
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
+
+  if (!Def || !MDT.dominates(Def, &Use))
+    return nullptr;
+
+  assert(Def->modifiesRegister(Reg, this));
+
+  return Def;
+}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 5a51b67ca719c..b82fefde47e13 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -228,6 +228,12 @@ public:
   getConstrainedRegClassForOperand(const MachineOperand &MO,
                                  const MachineRegisterInfo &MRI) const override;
 
+  // Find reaching register definition
+  MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg,
+                                MachineInstr &Use,
+                                MachineRegisterInfo &MRI,
+                                LiveIntervals *LIS) const;
+
 private:
   void buildSpillLoadStore(MachineBasicBlock::iterator MI,
                            unsigned LoadStoreOp,
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index f87a0763b353b..c625ecc9b750e 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -435,7 +435,7 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
   let AllocationPriority = 7;
 }
 
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
 }
@@ -444,13 +444,13 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add
   let isAllocatable = 0;
 }
 
-def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
   (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
 }
 
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
   (add SReg_64_XEXEC, EXEC)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
@@ -459,15 +459,15 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
 // Requires 2 s_mov_b64 to copy
 let CopyCost = 2 in {
 
-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> {
   let AllocationPriority = 10;
 }
 
-def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
+def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add TTMP_128Regs)> {
   let isAllocatable = 0;
 }
 
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64, v2f64], 32,
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
   (add SGPR_128, TTMP_128)> {
   let AllocationPriority = 10;
 }
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 4189bcce52ea1..6ad7dd0e3a7ce 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -64,59 +64,6 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() {
   return new SIShrinkInstructions();
 }
 
-static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
-                      const SIRegisterInfo &TRI,
-                      const MachineRegisterInfo &MRI) {
-
-  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-  // Can't shrink instruction with three operands.
-  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
-  // a special case for it.  It can only be shrunk if the third operand
-  // is vcc.  We should handle this the same way we handle vopc, by addding
-  // a register allocation hint pre-regalloc and then do the shrinking
-  // post-regalloc.
-  if (Src2) {
-    switch (MI.getOpcode()) {
-      default: return false;
-
-      case AMDGPU::V_ADDC_U32_e64:
-      case AMDGPU::V_SUBB_U32_e64:
-      case AMDGPU::V_SUBBREV_U32_e64: {
-        const MachineOperand *Src1
-          = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-        if (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()))
-          return false;
-        // Additional verification is needed for sdst/src2.
-        return true;
-      }
-      case AMDGPU::V_MAC_F32_e64:
-      case AMDGPU::V_MAC_F16_e64:
-      case AMDGPU::V_FMAC_F32_e64:
-        if (!Src2->isReg() || !TRI.isVGPR(MRI, Src2->getReg()) ||
-            TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
-          return false;
-        break;
-
-      case AMDGPU::V_CNDMASK_B32_e64:
-        break;
-    }
-  }
-
-  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-  if (Src1 && (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()) ||
-               TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
-    return false;
-
-  // We don't need to check src0, all input types are legal, so just make sure
-  // src0 isn't using any modifiers.
-  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
-    return false;
-
-  // Check output modifiers
-  return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
-         !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
-}
-
 /// This function checks \p MI for operands defined by a move immediate
 /// instruction and then folds the literal constant into the instruction if it
 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions.
@@ -173,19 +120,6 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
   return false;
 }
 
-// Copy MachineOperand with all flags except setting it as implicit.
-static void copyFlagsToImplicitVCC(MachineInstr &MI,
-                                   const MachineOperand &Orig) {
-
-  for (MachineOperand &Use : MI.implicit_operands()) {
-    if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
-      Use.setIsUndef(Orig.isUndef());
-      Use.setIsKill(Orig.isKill());
-      return;
-    }
-  }
-}
-
 static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
   return isInt<16>(Src.getImm()) &&
     !TII->isInlineConstant(*Src.getParent(),
@@ -278,6 +212,245 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
   }
 }
 
+/// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals.
+/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
+/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
+/// XNOR (as a ^ b == ~(a ^ ~b)).
+/// \returns true if the caller should continue the machine function iterator
+static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
+                                MachineRegisterInfo &MRI,
+                                const SIInstrInfo *TII,
+                                MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  const MachineOperand *Dest = &MI.getOperand(0);
+  MachineOperand *Src0 = &MI.getOperand(1);
+  MachineOperand *Src1 = &MI.getOperand(2);
+  MachineOperand *SrcReg = Src0;
+  MachineOperand *SrcImm = Src1;
+
+  if (SrcImm->isImm() &&
+      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
+    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
+    uint32_t NewImm = 0;
+
+    if (Opc == AMDGPU::S_AND_B32) {
+      if (isPowerOf2_32(~Imm)) {
+        NewImm = countTrailingOnes(Imm);
+        Opc = AMDGPU::S_BITSET0_B32;
+      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_ANDN2_B32;
+      }
+    } else if (Opc == AMDGPU::S_OR_B32) {
+      if (isPowerOf2_32(Imm)) {
+        NewImm = countTrailingZeros(Imm);
+        Opc = AMDGPU::S_BITSET1_B32;
+      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_ORN2_B32;
+      }
+    } else if (Opc == AMDGPU::S_XOR_B32) {
+      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_XNOR_B32;
+      }
+    } else {
+      llvm_unreachable("unexpected opcode");
+    }
+
+    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
+        SrcImm == Src0) {
+      if (!TII->commuteInstruction(MI, false, 1, 2))
+        NewImm = 0;
+    }
+
+    if (NewImm != 0) {
+      if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+        SrcReg->isReg()) {
+        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+        return true;
+      }
+
+      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
+        MI.setDesc(TII->get(Opc));
+        if (Opc == AMDGPU::S_BITSET0_B32 ||
+            Opc == AMDGPU::S_BITSET1_B32) {
+          Src0->ChangeToImmediate(NewImm);
+          MI.RemoveOperand(2);
+        } else {
+          SrcImm->setImm(NewImm);
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+// This is the same as MachineInstr::readsRegister/modifiesRegister except
+// it takes subregs into account.
+static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
+                          unsigned Reg, unsigned SubReg,
+                          const SIRegisterInfo &TRI) {
+  for (const MachineOperand &MO : R) {
+    if (!MO.isReg())
+      continue;
+
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+        TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      if (TRI.regsOverlap(Reg, MO.getReg()))
+        return true;
+    } else if (MO.getReg() == Reg &&
+               TargetRegisterInfo::isVirtualRegister(Reg)) {
+      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
+                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
+      if (Overlap.any())
+        return true;
+    }
+  }
+  return false;
+}
+
+static bool instReadsReg(const MachineInstr *MI,
+                         unsigned Reg, unsigned SubReg,
+                         const SIRegisterInfo &TRI) {
+  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
+}
+
+static bool instModifiesReg(const MachineInstr *MI,
+                            unsigned Reg, unsigned SubReg,
+                            const SIRegisterInfo &TRI) {
+  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
+}
+
+static TargetInstrInfo::RegSubRegPair
+getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
+                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
+  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
+    } else {
+      LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
+      Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
+    }
+  }
+  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
+}
+
+// Match:
+// mov t, x
+// mov x, y
+// mov y, t
+//
+// =>
+//
+// mov t, x (t is potentially dead and move eliminated)
+// v_swap_b32 x, y
+//
+// Returns next valid instruction pointer if was able to create v_swap_b32.
+//
+// This shall not be done too early not to prevent possible folding which may
+// remove matched moves, and this should prefereably be done before RA to
+// release saved registers and also possibly after RA which can insert copies
+// too.
+//
+// This is really just a generic peephole that is not a canocical shrinking,
+// although requirements match the pass placement and it reduces code size too.
+static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
+                               const SIInstrInfo *TII) {
+  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+         MovT.getOpcode() == AMDGPU::COPY);
+
+  unsigned T = MovT.getOperand(0).getReg();
+  unsigned Tsub = MovT.getOperand(0).getSubReg();
+  MachineOperand &Xop = MovT.getOperand(1);
+
+  if (!Xop.isReg())
+    return nullptr;
+  unsigned X = Xop.getReg();
+  unsigned Xsub = Xop.getSubReg();
+
+  unsigned Size = TII->getOpSize(MovT, 0) / 4;
+
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  if (!TRI.isVGPR(MRI, X))
+    return nullptr;
+
+  for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
+    if (YTop.getSubReg() != Tsub)
+      continue;
+
+    MachineInstr &MovY = *YTop.getParent();
+    if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+         MovY.getOpcode() != AMDGPU::COPY) ||
+        MovY.getOperand(1).getSubReg() != Tsub)
+      continue;
+
+    unsigned Y = MovY.getOperand(0).getReg();
+    unsigned Ysub = MovY.getOperand(0).getSubReg();
+
+    if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
+      continue;
+
+    MachineInstr *MovX = nullptr;
+    auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
+    for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
+      if (instReadsReg(&*I, X, Xsub, TRI) ||
+          instModifiesReg(&*I, Y, Ysub, TRI) ||
+          instModifiesReg(&*I, T, Tsub, TRI) ||
+          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
+        MovX = nullptr;
+        break;
+      }
+      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
+        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
+          MovX = nullptr;
+          break;
+        }
+        continue;
+      }
+      if (MovX ||
+          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+           I->getOpcode() != AMDGPU::COPY) ||
+          I->getOperand(0).getReg() != X ||
+          I->getOperand(0).getSubReg() != Xsub) {
+        MovX = nullptr;
+        break;
+      }
+      MovX = &*I;
+    }
+
+    if (!MovX || I == E)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);
+
+    for (unsigned I = 0; I < Size; ++I) {
+      TargetInstrInfo::RegSubRegPair X1, Y1;
+      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
+      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
+      BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
+                TII->get(AMDGPU::V_SWAP_B32))
+        .addDef(X1.Reg, 0, X1.SubReg)
+        .addDef(Y1.Reg, 0, Y1.SubReg)
+        .addReg(Y1.Reg, 0, Y1.SubReg)
+        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
+    }
+    MovX->eraseFromParent();
+    MovY.eraseFromParent();
+    MachineInstr *Next = &*std::next(MovT.getIterator());
+    if (MRI.use_nodbg_empty(T))
+      MovT.eraseFromParent();
+    else
+      Xop.setIsKill(false);
+
+    return Next;
+  }
+
+  return nullptr;
+}
+
 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -285,7 +458,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
 
   std::vector<unsigned> I1Defs;
 
@@ -319,6 +491,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         }
       }
 
+      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+                           MI.getOpcode() == AMDGPU::COPY)) {
+        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
+          Next = NextMI->getIterator();
+          continue;
+        }
+      }
+
       // Combine adjacent s_nops to use the immediate operand encoding how long
       // to wait.
       //
@@ -408,14 +588,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         continue;
       }
 
+      // Shrink scalar logic operations.
+      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
+          MI.getOpcode() == AMDGPU::S_OR_B32 ||
+          MI.getOpcode() == AMDGPU::S_XOR_B32) {
+        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
+          continue;
+      }
+
       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
         continue;
 
-      if (!canShrink(MI, TII, TRI, MRI)) {
+      if (!TII->canShrink(MI, MRI)) {
         // Try commuting the instruction and see if that enables us to shrink
         // it.
         if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
-            !canShrink(MI, TII, TRI, MRI))
+            !TII->canShrink(MI, MRI))
           continue;
       }
 
@@ -488,40 +676,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       // We can shrink this instruction
       LLVM_DEBUG(dbgs() << "Shrinking " << MI);
 
-      MachineInstrBuilder Inst32 =
-          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
-
-      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
-      // For VOPC instructions, this is replaced by an implicit def of vcc.
-      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
-      if (Op32DstIdx != -1) {
-        // dst
-        Inst32.add(MI.getOperand(0));
-      } else {
-        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
-               "Unexpected case");
-      }
-
-
-      Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
-
-      const MachineOperand *Src1 =
-          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-      if (Src1)
-        Inst32.add(*Src1);
-
-      if (Src2) {
-        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
-        if (Op32Src2Idx != -1) {
-          Inst32.add(*Src2);
-        } else {
-          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
-          // replaced with an implicit read of vcc. This was already added
-          // during the initial BuildMI, so find it to preserve the flags.
-          copyFlagsToImplicitVCC(*Inst32, *Src2);
-        }
-      }
-
+      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
       ++NumInstructionsShrunk;
 
       // Copy extra operands not present in the instruction definition.
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 7485326017b26..8a063e1a48673 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -375,83 +375,6 @@ defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">;
 }
 
 //===----------------------------------------------------------------------===//
-// Scalar Memory Patterns
-//===----------------------------------------------------------------------===//
-
-
-def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
-  auto Ld = cast<LoadSDNode>(N);
-  return Ld->getAlignment() >= 4  &&
-    ((((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) || (Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT)) && !N->isDivergent()) ||
-    (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
-     !Ld->isVolatile() && !N->isDivergent() &&
-    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
-}]>;
-
-def SMRDImm         : ComplexPattern<i64, 2, "SelectSMRDImm">;
-def SMRDImm32       : ComplexPattern<i64, 2, "SelectSMRDImm32">;
-def SMRDSgpr        : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
-def SMRDBufferImm   : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
-def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
-
-multiclass SMRD_Pattern <string Instr, ValueType vt> {
-
-  // 1. IMM offset
-  def : GCNPat <
-    (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
-  >;
-
-  // 2. SGPR offset
-  def : GCNPat <
-    (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
-  >;
-}
-
-let OtherPredicates = [isSICI] in {
-def : GCNPat <
-  (i64 (readcyclecounter)),
-  (S_MEMTIME)
->;
-}
-
-// Global and constant loads can be selected to either MUBUF or SMRD
-// instructions, but SMRD instructions are faster so we want the instruction
-// selector to prefer those.
-let AddedComplexity = 100 in {
-
-defm : SMRD_Pattern <"S_LOAD_DWORD",    i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX2",  v2i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX4",  v4i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX8",  v8i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
-
-// 1. Offset as an immediate
-def SM_LOAD_PATTERN : GCNPat <  // name this pattern to reuse AddedComplexity on CI
-  (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
->;
-
-// 2. Offset loaded in an 32bit SGPR
-def : GCNPat <
-  (SIload_constant v4i32:$sbase, i32:$offset),
-  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
->;
-
-} // End let AddedComplexity = 100
-
-let OtherPredicates = [isVI] in {
-
-def : GCNPat <
-  (i64 (readcyclecounter)),
-  (S_MEMREALTIME)
->;
-
-} // let OtherPredicates = [isVI]
-
-
-//===----------------------------------------------------------------------===//
 // Targets
 //===----------------------------------------------------------------------===//
 
@@ -757,25 +680,97 @@ class SMRD_Real_ci <bits<5> op, SM_Pseudo ps>
 
 def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
 
-let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
+//===----------------------------------------------------------------------===//
+// Scalar Memory Patterns
+//===----------------------------------------------------------------------===//
+
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]>;
+
+def SMRDImm         : ComplexPattern<i64, 2, "SelectSMRDImm">;
+def SMRDImm32       : ComplexPattern<i64, 2, "SelectSMRDImm32">;
+def SMRDSgpr        : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
+def SMRDBufferImm   : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
+def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
+
+multiclass SMRD_Pattern <string Instr, ValueType vt> {
+
+  // 1. IMM offset
+  def : GCNPat <
+    (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
+  >;
+
+  // 2. 32-bit IMM offset on CI
+  def : GCNPat <
+    (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
+    (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
+    let OtherPredicates = [isCIOnly];
+  }
+
+  // 3. SGPR offset
+  def : GCNPat <
+    (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
+  >;
+}
+
+multiclass SMLoad_Pattern <string Instr, ValueType vt> {
+  // 1. Offset as an immediate
+  def : GCNPat <
+    (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc),
+    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc)))
+  >;
+
+  // 2. 32-bit IMM offset on CI
+  def : GCNPat <
+    (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
+    (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc))> {
+    let OtherPredicates = [isCIOnly];
+  }
 
-class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat <
-  (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
-  (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
-  let OtherPredicates = [isCIOnly];
+  // 3. Offset loaded in an 32bit SGPR
+  def : GCNPat <
+    (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc)))
+  >;
 }
 
-def : SMRD_Pattern_ci <"S_LOAD_DWORD",    i32>;
-def : SMRD_Pattern_ci <"S_LOAD_DWORDX2",  v2i32>;
-def : SMRD_Pattern_ci <"S_LOAD_DWORDX4",  v4i32>;
-def : SMRD_Pattern_ci <"S_LOAD_DWORDX8",  v8i32>;
-def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
+// Global and constant loads can be selected to either MUBUF or SMRD
+// instructions, but SMRD instructions are faster so we want the instruction
+// selector to prefer those.
+let AddedComplexity = 100 in {
+
+defm : SMRD_Pattern <"S_LOAD_DWORD",    i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX2",  v2i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX4",  v4i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8",  v8i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
+
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD",     i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2",   v2i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4",   v4i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8",   v8i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16",  v16i32>;
+
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD",     f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2",   v2f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4",   v4f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8",   v8f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16",  v16f32>;
+} // End let AddedComplexity = 100
 
+let OtherPredicates = [isSICI] in {
 def : GCNPat <
-  (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
-  let OtherPredicates = [isCI]; // should this be isCIOnly?
+  (i64 (readcyclecounter)),
+  (S_MEMTIME)
+>;
 }
 
-} // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity
+let OtherPredicates = [isVI] in {
 
+def : GCNPat <
+  (i64 (readcyclecounter)),
+  (S_MEMREALTIME)
+>;
+
+} // let OtherPredicates = [isVI]
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 6f5db9644c868..ca5e981ac5c25 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -336,42 +336,54 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
   "$sdst, $src0, $src1", pattern
 >;
 
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0),
+  (Op $src0),
+  [{ return !N->isDivergent(); }]
+>;
+
+class UniformBinFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0, node:$src1),
+  (Op $src0, $src1),
+  [{ return !N->isDivergent(); }]
+>;
+
 let Defs = [SCC] in { // Carry out goes to SCC
 let isCommutable = 1 in {
 def S_ADD_U32 : SOP2_32 <"s_add_u32">;
 def S_ADD_I32 : SOP2_32 <"s_add_i32",
-  [(set i32:$sdst, (add SSrc_b32:$src0, SSrc_b32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<add> SSrc_b32:$src0, SSrc_b32:$src1))]
 >;
 } // End isCommutable = 1
 
 def S_SUB_U32 : SOP2_32 <"s_sub_u32">;
 def S_SUB_I32 : SOP2_32 <"s_sub_i32",
-  [(set i32:$sdst, (sub SSrc_b32:$src0, SSrc_b32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<sub> SSrc_b32:$src0, SSrc_b32:$src1))]
 >;
 
 let Uses = [SCC] in { // Carry in comes from SCC
 let isCommutable = 1 in {
 def S_ADDC_U32 : SOP2_32 <"s_addc_u32",
-  [(set i32:$sdst, (adde (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+  [(set i32:$sdst, (UniformBinFrag<adde> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
 } // End isCommutable = 1
 
 def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
-  [(set i32:$sdst, (sube (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+  [(set i32:$sdst, (UniformBinFrag<sube> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
 } // End Uses = [SCC]
 
 
 let isCommutable = 1 in {
 def S_MIN_I32 : SOP2_32 <"s_min_i32",
-  [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))]
 >;
 def S_MIN_U32 : SOP2_32 <"s_min_u32",
-  [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))]
 >;
 def S_MAX_I32 : SOP2_32 <"s_max_i32",
-  [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))]
 >;
 def S_MAX_U32 : SOP2_32 <"s_max_u32",
-  [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))]
 >;
 } // End isCommutable = 1
 } // End Defs = [SCC]
@@ -385,27 +397,27 @@ let Uses = [SCC] in {
 let Defs = [SCC] in {
 let isCommutable = 1 in {
 def S_AND_B32 : SOP2_32 <"s_and_b32",
-  [(set i32:$sdst, (and i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, i32:$src1))]
 >;
 
 def S_AND_B64 : SOP2_64 <"s_and_b64",
-  [(set i64:$sdst, (and i64:$src0, i64:$src1))]
+  [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, i64:$src1))]
 >;
 
 def S_OR_B32 : SOP2_32 <"s_or_b32",
-  [(set i32:$sdst, (or i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, i32:$src1))]
 >;
 
 def S_OR_B64 : SOP2_64 <"s_or_b64",
-  [(set i64:$sdst, (or i64:$src0, i64:$src1))]
+  [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, i64:$src1))]
 >;
 
 def S_XOR_B32 : SOP2_32 <"s_xor_b32",
-  [(set i32:$sdst, (xor i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<xor> i32:$src0, i32:$src1))]
 >;
 
 def S_XOR_B64 : SOP2_64 <"s_xor_b64",
-  [(set i64:$sdst, (xor i64:$src0, i64:$src1))]
+  [(set i64:$sdst, (UniformBinFrag<xor> i64:$src0, i64:$src1))]
 >;
 
 def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
@@ -415,45 +427,71 @@ def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
 def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
   [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
 >;
+
+def S_NAND_B32 : SOP2_32 <"s_nand_b32",
+  [(set i32:$sdst, (not (and_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NAND_B64 : SOP2_64 <"s_nand_b64",
+  [(set i64:$sdst, (not (and_oneuse i64:$src0, i64:$src1)))]
+>;
+
+def S_NOR_B32 : SOP2_32 <"s_nor_b32",
+  [(set i32:$sdst, (not (or_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NOR_B64 : SOP2_64 <"s_nor_b64",
+  [(set i64:$sdst, (not (or_oneuse i64:$src0, i64:$src1)))]
+>;
 } // End isCommutable = 1
 
-def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">;
-def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">;
-def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">;
-def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">;
-def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
-def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
-def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
-def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
+def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
+  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64",
+  [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
+
+def S_ORN2_B32 : SOP2_32 <"s_orn2_b32",
+  [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ORN2_B64 : SOP2_64 <"s_orn2_b64",
+  [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
 } // End Defs = [SCC]
 
 // Use added complexity so these patterns are preferred to the VALU patterns.
 let AddedComplexity = 1 in {
 
 let Defs = [SCC] in {
+// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
 def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
-  [(set i32:$sdst, (shl i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<shl> i32:$src0, i32:$src1))]
 >;
 def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
-  [(set i64:$sdst, (shl i64:$src0, i32:$src1))]
+  [(set i64:$sdst, (UniformBinFrag<shl> i64:$src0, i32:$src1))]
 >;
 def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
-  [(set i32:$sdst, (srl i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<srl> i32:$src0, i32:$src1))]
 >;
 def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
-  [(set i64:$sdst, (srl i64:$src0, i32:$src1))]
+  [(set i64:$sdst, (UniformBinFrag<srl> i64:$src0, i32:$src1))]
 >;
 def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
-  [(set i32:$sdst, (sra i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<sra> i32:$src0, i32:$src1))]
 >;
 def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
-  [(set i64:$sdst, (sra i64:$src0, i32:$src1))]
+  [(set i64:$sdst, (UniformBinFrag<sra> i64:$src0, i32:$src1))]
 >;
 } // End Defs = [SCC]
 
 def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
-  [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
+  [(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>;
 def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
+
+// TODO: S_MUL_I32 require V_MUL_LO_I32 from VOP3 change
 def S_MUL_I32 : SOP2_32 <"s_mul_i32",
   [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
   let isCommutable = 1;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4eba193823154..54c866bdc63ce 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -128,131 +128,127 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
   return NewInfo ? NewInfo->Opcode : -1;
 }
 
-// Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
-// header files, so we need to wrap it in a function that takes unsigned
-// instead.
-int getMCOpcode(uint16_t Opcode, unsigned Gen) {
-  return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
+struct MUBUFInfo {
+  uint16_t Opcode;
+  uint16_t BaseOpcode;
+  uint8_t dwords;
+  bool has_vaddr;
+  bool has_srsrc;
+  bool has_soffset;
+};
+
+#define GET_MUBUFInfoTable_DECL
+#define GET_MUBUFInfoTable_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+
+int getMUBUFBaseOpcode(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc);
+  return Info ? Info->BaseOpcode : -1;
 }
 
-namespace IsaInfo {
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) {
+  const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords);
+  return Info ? Info->Opcode : -1;
+}
 
-IsaVersion getIsaVersion(const FeatureBitset &Features) {
-  // GCN GFX6 (Southern Islands (SI)).
-  if (Features.test(FeatureISAVersion6_0_0))
-    return {6, 0, 0};
-  if (Features.test(FeatureISAVersion6_0_1))
-    return {6, 0, 1};
+int getMUBUFDwords(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->dwords : 0;
+}
 
-  // GCN GFX7 (Sea Islands (CI)).
-  if (Features.test(FeatureISAVersion7_0_0))
-    return {7, 0, 0};
-  if (Features.test(FeatureISAVersion7_0_1))
-    return {7, 0, 1};
-  if (Features.test(FeatureISAVersion7_0_2))
-    return {7, 0, 2};
-  if (Features.test(FeatureISAVersion7_0_3))
-    return {7, 0, 3};
-  if (Features.test(FeatureISAVersion7_0_4))
-    return {7, 0, 4};
-  if (Features.test(FeatureSeaIslands))
-    return {7, 0, 0};
+bool getMUBUFHasVAddr(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->has_vaddr : false;
+}
 
-  // GCN GFX8 (Volcanic Islands (VI)).
-  if (Features.test(FeatureISAVersion8_0_1))
-    return {8, 0, 1};
-  if (Features.test(FeatureISAVersion8_0_2))
-    return {8, 0, 2};
-  if (Features.test(FeatureISAVersion8_0_3))
-    return {8, 0, 3};
-  if (Features.test(FeatureISAVersion8_1_0))
-    return {8, 1, 0};
-  if (Features.test(FeatureVolcanicIslands))
-    return {8, 0, 0};
+bool getMUBUFHasSrsrc(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->has_srsrc : false;
+}
 
-  // GCN GFX9.
-  if (Features.test(FeatureISAVersion9_0_0))
-    return {9, 0, 0};
-  if (Features.test(FeatureISAVersion9_0_2))
-    return {9, 0, 2};
-  if (Features.test(FeatureISAVersion9_0_4))
-    return {9, 0, 4};
-  if (Features.test(FeatureISAVersion9_0_6))
-    return {9, 0, 6};
-  if (Features.test(FeatureGFX9))
-    return {9, 0, 0};
+bool getMUBUFHasSoffset(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->has_soffset : false;
+}
 
-  if (Features.test(FeatureSouthernIslands))
-    return {0, 0, 0};
-  return {7, 0, 0};
+// Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
+// header files, so we need to wrap it in a function that takes unsigned
+// instead.
+int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+  return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
 }
 
+namespace IsaInfo {
+
 void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
   auto TargetTriple = STI->getTargetTriple();
-  auto ISAVersion = IsaInfo::getIsaVersion(STI->getFeatureBits());
+  auto Version = getIsaVersion(STI->getCPU());
 
   Stream << TargetTriple.getArchName() << '-'
          << TargetTriple.getVendorName() << '-'
          << TargetTriple.getOSName() << '-'
          << TargetTriple.getEnvironmentName() << '-'
          << "gfx"
-         << ISAVersion.Major
-         << ISAVersion.Minor
-         << ISAVersion.Stepping;
+         << Version.Major
+         << Version.Minor
+         << Version.Stepping;
 
   if (hasXNACK(*STI))
     Stream << "+xnack";
+  if (hasSRAMECC(*STI))
+    Stream << "+sram-ecc";
 
   Stream.flush();
 }
 
 bool hasCodeObjectV3(const MCSubtargetInfo *STI) {
-  return STI->getFeatureBits().test(FeatureCodeObjectV3);
+  return STI->getTargetTriple().getOS() == Triple::AMDHSA &&
+             STI->getFeatureBits().test(FeatureCodeObjectV3);
 }
 
-unsigned getWavefrontSize(const FeatureBitset &Features) {
-  if (Features.test(FeatureWavefrontSize16))
+unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
+  if (STI->getFeatureBits().test(FeatureWavefrontSize16))
     return 16;
-  if (Features.test(FeatureWavefrontSize32))
+  if (STI->getFeatureBits().test(FeatureWavefrontSize32))
     return 32;
 
   return 64;
 }
 
-unsigned getLocalMemorySize(const FeatureBitset &Features) {
-  if (Features.test(FeatureLocalMemorySize32768))
+unsigned getLocalMemorySize(const MCSubtargetInfo *STI) {
+  if (STI->getFeatureBits().test(FeatureLocalMemorySize32768))
     return 32768;
-  if (Features.test(FeatureLocalMemorySize65536))
+  if (STI->getFeatureBits().test(FeatureLocalMemorySize65536))
     return 65536;
 
   return 0;
 }
 
-unsigned getEUsPerCU(const FeatureBitset &Features) {
+unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
   return 4;
 }
 
-unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
+unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
                                unsigned FlatWorkGroupSize) {
-  if (!Features.test(FeatureGCN))
+  if (!STI->getFeatureBits().test(FeatureGCN))
     return 8;
-  unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+  unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize);
   if (N == 1)
     return 40;
   N = 40 / N;
   return std::min(N, 16u);
 }
 
-unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
-  return getMaxWavesPerEU() * getEUsPerCU(Features);
+unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) {
+  return getMaxWavesPerEU() * getEUsPerCU(STI);
 }
 
-unsigned getMaxWavesPerCU(const FeatureBitset &Features,
+unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
                           unsigned FlatWorkGroupSize) {
-  return getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+  return getWavesPerWorkGroup(STI, FlatWorkGroupSize);
 }
 
-unsigned getMinWavesPerEU(const FeatureBitset &Features) {
+unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
   return 1;
 }
 
@@ -261,89 +257,89 @@ unsigned getMaxWavesPerEU() {
   return 10;
 }
 
-unsigned getMaxWavesPerEU(const FeatureBitset &Features,
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
                           unsigned FlatWorkGroupSize) {
-  return alignTo(getMaxWavesPerCU(Features, FlatWorkGroupSize),
-                 getEUsPerCU(Features)) / getEUsPerCU(Features);
+  return alignTo(getMaxWavesPerCU(STI, FlatWorkGroupSize),
+                 getEUsPerCU(STI)) / getEUsPerCU(STI);
 }
 
-unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features) {
+unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) {
   return 1;
 }
 
-unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features) {
+unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) {
   return 2048;
 }
 
-unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
+unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
                               unsigned FlatWorkGroupSize) {
-  return alignTo(FlatWorkGroupSize, getWavefrontSize(Features)) /
-                 getWavefrontSize(Features);
+  return alignTo(FlatWorkGroupSize, getWavefrontSize(STI)) /
+                 getWavefrontSize(STI);
 }
 
-unsigned getSGPRAllocGranule(const FeatureBitset &Features) {
-  IsaVersion Version = getIsaVersion(Features);
+unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) {
+  IsaVersion Version = getIsaVersion(STI->getCPU());
   if (Version.Major >= 8)
     return 16;
   return 8;
 }
 
-unsigned getSGPREncodingGranule(const FeatureBitset &Features) {
+unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI) {
   return 8;
 }
 
-unsigned getTotalNumSGPRs(const FeatureBitset &Features) {
-  IsaVersion Version = getIsaVersion(Features);
+unsigned getTotalNumSGPRs(const MCSubtargetInfo *STI) {
+  IsaVersion Version = getIsaVersion(STI->getCPU());
   if (Version.Major >= 8)
     return 800;
   return 512;
 }
 
-unsigned getAddressableNumSGPRs(const FeatureBitset &Features) {
-  if (Features.test(FeatureSGPRInitBug))
+unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) {
+  if (STI->getFeatureBits().test(FeatureSGPRInitBug))
     return FIXED_NUM_SGPRS_FOR_INIT_BUG;
 
-  IsaVersion Version = getIsaVersion(Features);
+  IsaVersion Version = getIsaVersion(STI->getCPU());
   if (Version.Major >= 8)
     return 102;
   return 104;
 }
 
-unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   assert(WavesPerEU != 0);
 
   if (WavesPerEU >= getMaxWavesPerEU())
     return 0;
 
-  unsigned MinNumSGPRs = getTotalNumSGPRs(Features) / (WavesPerEU + 1);
-  if (Features.test(FeatureTrapHandler))
+  unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1);
+  if (STI->getFeatureBits().test(FeatureTrapHandler))
     MinNumSGPRs -= std::min(MinNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
-  MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(Features)) + 1;
-  return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features));
+  MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(STI)) + 1;
+  return std::min(MinNumSGPRs, getAddressableNumSGPRs(STI));
 }
 
-unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
+unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
                         bool Addressable) {
   assert(WavesPerEU != 0);
 
-  IsaVersion Version = getIsaVersion(Features);
-  unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features);
+  IsaVersion Version = getIsaVersion(STI->getCPU());
+  unsigned AddressableNumSGPRs = getAddressableNumSGPRs(STI);
   if (Version.Major >= 8 && !Addressable)
     AddressableNumSGPRs = 112;
-  unsigned MaxNumSGPRs = getTotalNumSGPRs(Features) / WavesPerEU;
-  if (Features.test(FeatureTrapHandler))
+  unsigned MaxNumSGPRs = getTotalNumSGPRs(STI) / WavesPerEU;
+  if (STI->getFeatureBits().test(FeatureTrapHandler))
     MaxNumSGPRs -= std::min(MaxNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
-  MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(Features));
+  MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(STI));
   return std::min(MaxNumSGPRs, AddressableNumSGPRs);
 }
 
-unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
                           bool FlatScrUsed, bool XNACKUsed) {
   unsigned ExtraSGPRs = 0;
   if (VCCUsed)
     ExtraSGPRs = 2;
 
-  IsaVersion Version = getIsaVersion(Features);
+  IsaVersion Version = getIsaVersion(STI->getCPU());
   if (Version.Major < 8) {
     if (FlatScrUsed)
       ExtraSGPRs = 4;
@@ -358,74 +354,74 @@ unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
   return ExtraSGPRs;
 }
 
-unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
                           bool FlatScrUsed) {
-  return getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed,
-                          Features[AMDGPU::FeatureXNACK]);
+  return getNumExtraSGPRs(STI, VCCUsed, FlatScrUsed,
+                          STI->getFeatureBits().test(AMDGPU::FeatureXNACK));
 }
 
-unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs) {
-  NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(Features));
+unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
+  NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(STI));
   // SGPRBlocks is actual number of SGPR blocks minus 1.
-  return NumSGPRs / getSGPREncodingGranule(Features) - 1;
+  return NumSGPRs / getSGPREncodingGranule(STI) - 1;
 }
 
-unsigned getVGPRAllocGranule(const FeatureBitset &Features) {
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI) {
   return 4;
 }
 
-unsigned getVGPREncodingGranule(const FeatureBitset &Features) {
-  return getVGPRAllocGranule(Features);
+unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI) {
+  return getVGPRAllocGranule(STI);
 }
 
-unsigned getTotalNumVGPRs(const FeatureBitset &Features) {
+unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
   return 256;
 }
 
-unsigned getAddressableNumVGPRs(const FeatureBitset &Features) {
-  return getTotalNumVGPRs(Features);
+unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+  return getTotalNumVGPRs(STI);
 }
 
-unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   assert(WavesPerEU != 0);
 
   if (WavesPerEU >= getMaxWavesPerEU())
     return 0;
   unsigned MinNumVGPRs =
-      alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1),
-                getVGPRAllocGranule(Features)) + 1;
-  return std::min(MinNumVGPRs, getAddressableNumVGPRs(Features));
+      alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1),
+                getVGPRAllocGranule(STI)) + 1;
+  return std::min(MinNumVGPRs, getAddressableNumVGPRs(STI));
 }
 
-unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   assert(WavesPerEU != 0);
 
-  unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(Features) / WavesPerEU,
-                                   getVGPRAllocGranule(Features));
-  unsigned AddressableNumVGPRs = getAddressableNumVGPRs(Features);
+  unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
+                                   getVGPRAllocGranule(STI));
+  unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI);
   return std::min(MaxNumVGPRs, AddressableNumVGPRs);
 }
 
-unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumVGPRs) {
-  NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(Features));
+unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs) {
+  NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(STI));
   // VGPRBlocks is actual number of VGPR blocks minus 1.
-  return NumVGPRs / getVGPREncodingGranule(Features) - 1;
+  return NumVGPRs / getVGPREncodingGranule(STI) - 1;
 }
 
 } // end namespace IsaInfo
 
 void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
-                               const FeatureBitset &Features) {
-  IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(Features);
+                               const MCSubtargetInfo *STI) {
+  IsaVersion Version = getIsaVersion(STI->getCPU());
 
   memset(&Header, 0, sizeof(Header));
 
   Header.amd_kernel_code_version_major = 1;
   Header.amd_kernel_code_version_minor = 2;
   Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
-  Header.amd_machine_version_major = ISA.Major;
-  Header.amd_machine_version_minor = ISA.Minor;
-  Header.amd_machine_version_stepping = ISA.Stepping;
+  Header.amd_machine_version_major = Version.Major;
+  Header.amd_machine_version_minor = Version.Minor;
+  Header.amd_machine_version_stepping = Version.Stepping;
   Header.kernel_code_entry_byte_offset = sizeof(Header);
   // wavefront_size is specified as a power of 2: 2^6 = 64 threads.
   Header.wavefront_size = 6;
@@ -513,7 +509,7 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
   return Ints;
 }
 
-unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) {
+unsigned getVmcntBitMask(const IsaVersion &Version) {
   unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1;
   if (Version.Major < 9)
     return VmcntLo;
@@ -522,15 +518,15 @@ unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) {
   return VmcntLo | VmcntHi;
 }
 
-unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) {
+unsigned getExpcntBitMask(const IsaVersion &Version) {
   return (1 << getExpcntBitWidth()) - 1;
 }
 
-unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) {
+unsigned getLgkmcntBitMask(const IsaVersion &Version) {
   return (1 << getLgkmcntBitWidth()) - 1;
 }
 
-unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) {
+unsigned getWaitcntBitMask(const IsaVersion &Version) {
   unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
   unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
   unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
@@ -542,7 +538,7 @@ unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) {
   return Waitcnt | VmcntHi;
 }
 
-unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
+unsigned decodeVmcnt(const IsaVersion &Version, unsigned Waitcnt) {
   unsigned VmcntLo =
       unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
   if (Version.Major < 9)
@@ -554,22 +550,30 @@ unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
   return VmcntLo | VmcntHi;
 }
 
-unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
+unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) {
   return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
 }
 
-unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
+unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) {
   return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
 }
 
-void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
                    unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) {
   Vmcnt = decodeVmcnt(Version, Waitcnt);
   Expcnt = decodeExpcnt(Version, Waitcnt);
   Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
 }
 
-unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) {
+  Waitcnt Decoded;
+  Decoded.VmCnt = decodeVmcnt(Version, Encoded);
+  Decoded.ExpCnt = decodeExpcnt(Version, Encoded);
+  Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded);
+  return Decoded;
+}
+
+unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt,
                      unsigned Vmcnt) {
   Waitcnt =
       packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
@@ -580,17 +584,17 @@ unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
   return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
 }
 
-unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt,
                       unsigned Expcnt) {
   return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
 }
 
-unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
                        unsigned Lgkmcnt) {
   return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
 }
 
-unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
+unsigned encodeWaitcnt(const IsaVersion &Version,
                        unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
   unsigned Waitcnt = getWaitcntBitMask(Version);
   Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt);
@@ -599,6 +603,10 @@ unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
   return Waitcnt;
 }
 
+unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) {
+  return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt);
+}
+
 unsigned getInitialPSInputAddr(const Function &F) {
   return getIntegerAttribute(F, "InitialPSInputAddr", 0);
 }
@@ -643,6 +651,10 @@ bool hasXNACK(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
 }
 
+bool hasSRAMECC(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureSRAMECC];
+}
+
 bool hasMIMG_R128(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
 }
@@ -798,6 +810,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::VS_64RegClassID:
   case AMDGPU::SReg_64RegClassID:
   case AMDGPU::VReg_64RegClassID:
+  case AMDGPU::SReg_64_XEXECRegClassID:
     return 64;
   case AMDGPU::VReg_96RegClassID:
     return 96;
@@ -935,27 +948,50 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
     isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
 }
 
-} // end namespace AMDGPU
-
-} // end namespace llvm
-
-namespace llvm {
-namespace AMDGPU {
+// Given Imm, split it into the values to put into the SOffset and ImmOffset
+// fields in an MUBUF instruction. Return false if it is not possible (due to a
+// hardware bug needing a workaround).
+//
+// The required alignment ensures that individual address components remain
+// aligned if they are aligned to begin with. It also ensures that additional
+// offsets within the given alignment can be added to the resulting ImmOffset.
+bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
+                      const GCNSubtarget *Subtarget, uint32_t Align) {
+  const uint32_t MaxImm = alignDown(4095, Align);
+  uint32_t Overflow = 0;
 
-AMDGPUAS getAMDGPUAS(Triple T) {
-  AMDGPUAS AS;
-  AS.FLAT_ADDRESS = 0;
-  AS.PRIVATE_ADDRESS = 5;
-  AS.REGION_ADDRESS = 2;
-  return AS;
-}
+  if (Imm > MaxImm) {
+    if (Imm <= MaxImm + 64) {
+      // Use an SOffset inline constant for 4..64
+      Overflow = Imm - MaxImm;
+      Imm = MaxImm;
+    } else {
+      // Try to keep the same value in SOffset for adjacent loads, so that
+      // the corresponding register contents can be re-used.
+      //
+      // Load values with all low-bits (except for alignment bits) set into
+      // SOffset, so that a larger range of values can be covered using
+      // s_movk_i32.
+      //
+      // Atomic operations fail to work correctly when individual address
+      // components are unaligned, even if their sum is aligned.
+      uint32_t High = (Imm + Align) & ~4095;
+      uint32_t Low = (Imm + Align) & 4095;
+      Imm = Low;
+      Overflow = High - Align;
+    }
+  }
 
-AMDGPUAS getAMDGPUAS(const TargetMachine &M) {
-  return getAMDGPUAS(M.getTargetTriple());
-}
+  // There is a hardware bug in SI and CI which prevents address clamping in
+  // MUBUF instructions from working correctly with SOffsets. The immediate
+  // offset is unaffected.
+  if (Overflow > 0 &&
+      Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
 
-AMDGPUAS getAMDGPUAS(const Module &M) {
-  return getAMDGPUAS(Triple(M.getTargetTriple()));
+  ImmOffset = Imm;
+  SOffset = Overflow;
+  return true;
 }
 
 namespace {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 5b7af8268cdaf..20123ed4ac815 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -19,6 +19,7 @@
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetParser.h"
 #include <cstdint>
 #include <string>
 #include <utility>
@@ -26,8 +27,10 @@
 namespace llvm {
 
 class Argument;
+class AMDGPUSubtarget;
 class FeatureBitset;
 class Function;
+class GCNSubtarget;
 class GlobalValue;
 class MCContext;
 class MCRegisterClass;
@@ -54,16 +57,6 @@ enum {
   TRAP_NUM_SGPRS = 16
 };
 
-/// Instruction set architecture version.
-struct IsaVersion {
-  unsigned Major;
-  unsigned Minor;
-  unsigned Stepping;
-};
-
-/// \returns Isa version for given subtarget \p Features.
-IsaVersion getIsaVersion(const FeatureBitset &Features);
-
 /// Streams isa version string for given subtarget \p STI into \p Stream.
 void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
 
@@ -71,114 +64,114 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
 /// false otherwise.
 bool hasCodeObjectV3(const MCSubtargetInfo *STI);
 
-/// \returns Wavefront size for given subtarget \p Features.
-unsigned getWavefrontSize(const FeatureBitset &Features);
+/// \returns Wavefront size for given subtarget \p STI.
+unsigned getWavefrontSize(const MCSubtargetInfo *STI);
 
-/// \returns Local memory size in bytes for given subtarget \p Features.
-unsigned getLocalMemorySize(const FeatureBitset &Features);
+/// \returns Local memory size in bytes for given subtarget \p STI.
+unsigned getLocalMemorySize(const MCSubtargetInfo *STI);
 
 /// \returns Number of execution units per compute unit for given subtarget \p
-/// Features.
-unsigned getEUsPerCU(const FeatureBitset &Features);
+/// STI.
+unsigned getEUsPerCU(const MCSubtargetInfo *STI);
 
 /// \returns Maximum number of work groups per compute unit for given subtarget
-/// \p Features and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
+/// \p STI and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
                                unsigned FlatWorkGroupSize);
 
 /// \returns Maximum number of waves per compute unit for given subtarget \p
-/// Features without any kind of limitation.
-unsigned getMaxWavesPerCU(const FeatureBitset &Features);
+/// STI without any kind of limitation.
+unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI);
 
 /// \returns Maximum number of waves per compute unit for given subtarget \p
-/// Features and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWavesPerCU(const FeatureBitset &Features,
+/// STI and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
                           unsigned FlatWorkGroupSize);
 
 /// \returns Minimum number of waves per execution unit for given subtarget \p
-/// Features.
-unsigned getMinWavesPerEU(const FeatureBitset &Features);
+/// STI.
+unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);
 
 /// \returns Maximum number of waves per execution unit for given subtarget \p
-/// Features without any kind of limitation.
+/// STI without any kind of limitation.
 unsigned getMaxWavesPerEU();
 
 /// \returns Maximum number of waves per execution unit for given subtarget \p
-/// Features and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWavesPerEU(const FeatureBitset &Features,
+/// STI and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
                           unsigned FlatWorkGroupSize);
 
-/// \returns Minimum flat work group size for given subtarget \p Features.
-unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features);
+/// \returns Minimum flat work group size for given subtarget \p STI.
+unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI);
 
-/// \returns Maximum flat work group size for given subtarget \p Features.
-unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features);
+/// \returns Maximum flat work group size for given subtarget \p STI.
+unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI);
 
-/// \returns Number of waves per work group for given subtarget \p Features and
+/// \returns Number of waves per work group for given subtarget \p STI and
 /// limited by given \p FlatWorkGroupSize.
-unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
+unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
                               unsigned FlatWorkGroupSize);
 
-/// \returns SGPR allocation granularity for given subtarget \p Features.
-unsigned getSGPRAllocGranule(const FeatureBitset &Features);
+/// \returns SGPR allocation granularity for given subtarget \p STI.
+unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI);
 
-/// \returns SGPR encoding granularity for given subtarget \p Features.
-unsigned getSGPREncodingGranule(const FeatureBitset &Features);
+/// \returns SGPR encoding granularity for given subtarget \p STI.
+unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI);
 
-/// \returns Total number of SGPRs for given subtarget \p Features.
-unsigned getTotalNumSGPRs(const FeatureBitset &Features);
+/// \returns Total number of SGPRs for given subtarget \p STI.
+unsigned getTotalNumSGPRs(const MCSubtargetInfo *STI);
 
-/// \returns Addressable number of SGPRs for given subtarget \p Features.
-unsigned getAddressableNumSGPRs(const FeatureBitset &Features);
+/// \returns Addressable number of SGPRs for given subtarget \p STI.
+unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI);
 
 /// \returns Minimum number of SGPRs that meets the given number of waves per
-/// execution unit requirement for given subtarget \p Features.
-unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
 /// \returns Maximum number of SGPRs that meets the given number of waves per
-/// execution unit requirement for given subtarget \p Features.
-unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
                         bool Addressable);
 
 /// \returns Number of extra SGPRs implicitly required by given subtarget \p
-/// Features when the given special registers are used.
-unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+/// STI when the given special registers are used.
+unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
                           bool FlatScrUsed, bool XNACKUsed);
 
 /// \returns Number of extra SGPRs implicitly required by given subtarget \p
-/// Features when the given special registers are used. XNACK is inferred from
-/// \p Features.
-unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+/// STI when the given special registers are used. XNACK is inferred from
+/// \p STI.
+unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
                           bool FlatScrUsed);
 
-/// \returns Number of SGPR blocks needed for given subtarget \p Features when
+/// \returns Number of SGPR blocks needed for given subtarget \p STI when
 /// \p NumSGPRs are used. \p NumSGPRs should already include any special
 /// register counts.
-unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs);
+unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
 
-/// \returns VGPR allocation granularity for given subtarget \p Features.
-unsigned getVGPRAllocGranule(const FeatureBitset &Features);
+/// \returns VGPR allocation granularity for given subtarget \p STI.
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI);
 
-/// \returns VGPR encoding granularity for given subtarget \p Features.
-unsigned getVGPREncodingGranule(const FeatureBitset &Features);
+/// \returns VGPR encoding granularity for given subtarget \p STI.
+unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI);
 
-/// \returns Total number of VGPRs for given subtarget \p Features.
-unsigned getTotalNumVGPRs(const FeatureBitset &Features);
+/// \returns Total number of VGPRs for given subtarget \p STI.
+unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
 
-/// \returns Addressable number of VGPRs for given subtarget \p Features.
-unsigned getAddressableNumVGPRs(const FeatureBitset &Features);
+/// \returns Addressable number of VGPRs for given subtarget \p STI.
+unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
 
 /// \returns Minimum number of VGPRs that meets given number of waves per
-/// execution unit requirement for given subtarget \p Features.
-unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
 /// \returns Maximum number of VGPRs that meets given number of waves per
-/// execution unit requirement for given subtarget \p Features.
-unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
-/// \returns Number of VGPR blocks needed for given subtarget \p Features when
+/// \returns Number of VGPR blocks needed for given subtarget \p STI when
 /// \p NumVGPRs are used.
-unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs);
+unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
 
 } // end namespace IsaInfo
 
@@ -191,6 +184,7 @@ struct MIMGBaseOpcodeInfo {
   bool Atomic;
   bool AtomicX2;
   bool Sampler;
+  bool Gather4;
 
   uint8_t NumExtraArgs;
   bool Gradients;
@@ -228,10 +222,28 @@ LLVM_READONLY
 int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
 
 LLVM_READONLY
+int getMUBUFBaseOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords);
+
+LLVM_READONLY
+int getMUBUFDwords(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasVAddr(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasSrsrc(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasSoffset(unsigned Opc);
+
+LLVM_READONLY
 int getMCOpcode(uint16_t Opcode, unsigned Gen);
 
 void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
-                               const FeatureBitset &Features);
+                               const MCSubtargetInfo *STI);
 
 amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor();
 
@@ -265,26 +277,52 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
                                             std::pair<int, int> Default,
                                             bool OnlyFirstRequired = false);
 
+/// Represents the counter values to wait for in an s_waitcnt instruction.
+///
+/// Large values (including the maximum possible integer) can be used to
+/// represent "don't care" waits.
+struct Waitcnt {
+  unsigned VmCnt = ~0u;
+  unsigned ExpCnt = ~0u;
+  unsigned LgkmCnt = ~0u;
+
+  Waitcnt() {}
+  Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt)
+      : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt) {}
+
+  static Waitcnt allZero() { return Waitcnt(0, 0, 0); }
+
+  bool dominates(const Waitcnt &Other) const {
+    return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
+           LgkmCnt <= Other.LgkmCnt;
+  }
+
+  Waitcnt combined(const Waitcnt &Other) const {
+    return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt),
+                   std::min(LgkmCnt, Other.LgkmCnt));
+  }
+};
+
 /// \returns Vmcnt bit mask for given isa \p Version.
-unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version);
+unsigned getVmcntBitMask(const IsaVersion &Version);
 
 /// \returns Expcnt bit mask for given isa \p Version.
-unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version);
+unsigned getExpcntBitMask(const IsaVersion &Version);
 
 /// \returns Lgkmcnt bit mask for given isa \p Version.
-unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version);
+unsigned getLgkmcntBitMask(const IsaVersion &Version);
 
 /// \returns Waitcnt bit mask for given isa \p Version.
-unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version);
+unsigned getWaitcntBitMask(const IsaVersion &Version);
 
 /// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
+unsigned decodeVmcnt(const IsaVersion &Version, unsigned Waitcnt);
 
 /// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
+unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt);
 
 /// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
+unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt);
 
 /// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
 /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
@@ -295,19 +333,21 @@ unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
 ///     \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14]  (gfx9+ only)
 ///     \p Expcnt = \p Waitcnt[6:4]
 ///     \p Lgkmcnt = \p Waitcnt[11:8]
-void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
                    unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
 
+Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded);
+
 /// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version.
-unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt,
                      unsigned Vmcnt);
 
 /// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version.
-unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt,
                       unsigned Expcnt);
 
 /// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version.
-unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
                        unsigned Lgkmcnt);
 
 /// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
@@ -322,9 +362,11 @@ unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
 ///
 /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
 /// isa \p Version.
-unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
+unsigned encodeWaitcnt(const IsaVersion &Version,
                        unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
 
+unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded);
+
 unsigned getInitialPSInputAddr(const Function &F);
 
 LLVM_READNONE
@@ -349,6 +391,7 @@ inline bool isKernel(CallingConv::ID CC) {
 }
 
 bool hasXNACK(const MCSubtargetInfo &STI);
+bool hasSRAMECC(const MCSubtargetInfo &STI);
 bool hasMIMG_R128(const MCSubtargetInfo &STI);
 bool hasPackedD16(const MCSubtargetInfo &STI);
 
@@ -447,6 +490,9 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 /// not the encoded offset.
 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 
+bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
+                      const GCNSubtarget *Subtarget, uint32_t Align = 4);
+
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
deleted file mode 100644
index 1924f71f11c84..0000000000000
--- a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// MBB A lane-dominates MBB B if
-// 1. A dominates B in the usual sense, i.e. every path from the entry to B
-//    goes through A, and
-// 2. whenever B executes, every active lane during that execution of B was
-//    also active during the most recent execution of A.
-//
-// The simplest example where A dominates B but does not lane-dominate it is
-// where A is a loop:
-//
-//     |
-//     +--+
-//     A  |
-//     +--+
-//     |
-//     B
-//
-// Unfortunately, the second condition is not fully captured by the control
-// flow graph when it is unstructured (as may happen when branch conditions are
-// uniform).
-//
-// The following replacement of the second condition is a conservative
-// approximation. It is an equivalent condition when the CFG is fully
-// structured:
-//
-// 2'. every cycle in the CFG that contains A also contains B.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPULaneDominator.h"
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-
-namespace llvm {
-
-namespace AMDGPU {
-
-// Given machine basic blocks A and B where A dominates B, check whether
-// A lane-dominates B.
-//
-// The check is conservative, i.e. there can be false-negatives.
-bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
-  // Check whether A is reachable from itself without going through B.
-  DenseSet<MachineBasicBlock *> Reachable;
-  SmallVector<MachineBasicBlock *, 8> Stack;
-
-  Stack.push_back(A);
-  do {
-    MachineBasicBlock *MBB = Stack.back();
-    Stack.pop_back();
-
-    for (MachineBasicBlock *Succ : MBB->successors()) {
-      if (Succ == A)
-        return false;
-      if (Succ != B && Reachable.insert(Succ).second)
-        Stack.push_back(Succ);
-    }
-  } while (!Stack.empty());
-
-  return true;
-}
-
-} // namespace AMDGPU
-
-} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
deleted file mode 100644
index 4f33a89a364bd..0000000000000
--- a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-
-namespace llvm {
-
-class MachineBasicBlock;
-
-namespace AMDGPU {
-
-bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
-
-} // end namespace AMDGPU
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index 9f0a4d29b5e43..82ffdef8e674a 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -46,6 +46,7 @@
    int64_t Value = 0; \
    if (!expectAbsExpression(MCParser, Value, Err)) \
      return false; \
+   C.compute_pgm_resource_registers &= ~(SetMacro(0xFFFFFFFFFFFFFFFFULL) << Shift); \
    C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \
    return true; \
 }
diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt
index c5ed32e46821b..01b80ebe8d3dc 100644
--- a/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,5 +2,4 @@ add_llvm_library(LLVMAMDGPUUtils
   AMDGPUBaseInfo.cpp
   AMDKernelCodeTUtils.cpp
   AMDGPUAsmUtils.cpp
-  AMDGPULaneDominator.cpp
   )
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 4c7a92219755b..68446ab79720a 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -84,6 +84,10 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP1";
 }
 
+class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_DPP_Pseudo <OpName, P, pattern> {
+}
+
 class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret =
     !if(P.HasModifiers,
@@ -103,6 +107,8 @@ multiclass VOP1Inst <string opName, VOPProfile P,
   def _e32 : VOP1_Pseudo <opName, P>;
   def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
   def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+  foreach _ = BoolToList<P.HasExtDPP>.ret in
+    def _dpp : VOP1_DPP_Pseudo <opName, P>;
 }
 
 // Special profile for instructions which have clamp
@@ -173,7 +179,9 @@ defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
 defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
 defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
 defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
+let FPDPRounding = 1 in {
 defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+} // End FPDPRounding = 1
 defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
 defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
 defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
@@ -226,7 +234,9 @@ defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
 let SchedRW = [WriteDoubleAdd] in {
 defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
 defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
+let FPDPRounding = 1 in {
 defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
+} // End FPDPRounding = 1
 } // End SchedRW = [WriteDoubleAdd]
 
 defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
@@ -242,7 +252,9 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
   let Src0RC64 = VRegSrc_32;
 
   let HasExt = 0;
-  let HasSDWA9 = 0;
+  let HasExtDPP = 0;
+  let HasExtSDWA = 0;
+  let HasExtSDWA9 = 0;
 }
 
 // Special case because there are no true output operands.  Hack vdst
@@ -271,7 +283,10 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
   let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
 
   let HasExt = 0;
-  let HasSDWA9 = 0;
+  let HasExtDPP = 0;
+  let HasExtSDWA = 0;
+  let HasExtSDWA9 = 0;
+
   let HasDst = 0;
   let EmitDst = 1; // force vdst emission
 }
@@ -328,8 +343,10 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
 
 let SubtargetPredicate = Has16BitInsts in {
 
+let FPDPRounding = 1 in {
 defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
 defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
+} // End FPDPRounding = 1
 defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
 defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
 let SchedRW = [WriteQuarterRate32] in {
@@ -347,7 +364,9 @@ defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
 defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
 defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
 defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
+let FPDPRounding = 1 in {
 defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
+} // End FPDPRounding = 1
 
 }
 
@@ -495,13 +514,8 @@ defm V_EXP_LEGACY_F32    : VOP1_Real_ci <0x46>;
 // VI
 //===----------------------------------------------------------------------===//
 
-class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
-  VOP_DPP <ps.OpName, P> {
-  let Defs = ps.Defs;
-  let Uses = ps.Uses;
-  let SchedRW = ps.SchedRW;
-  let hasSideEffects = ps.hasSideEffects;
-
+class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
+  VOP_DPPe <P> {
   bits<8> vdst;
   let Inst{8-0}   = 0xfa; // dpp
   let Inst{16-9}  = op;
@@ -539,9 +553,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
     VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
+      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
 defm V_NOP               : VOP1_Real_vi <0x0>;
@@ -712,9 +727,11 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
     VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+
 }
 
 defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 5ec1a15c5cd20..e3fd7b5f9fadd 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -105,6 +105,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP2";
 }
 
+class VOP2_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_DPP_Pseudo <OpName, P, pattern> {
+}
+
+
 class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret = !if(P.HasModifiers,
     [(set P.DstVT:$vdst,
@@ -116,22 +121,49 @@ class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
     [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]);
 }
 
-multiclass VOP2Inst <string opName,
-                     VOPProfile P,
-                     SDPatternOperator node = null_frag,
-                     string revOp = opName,
-                     bit GFX9Renamed = 0> {
-
+multiclass VOP2Inst_e32<string opName,
+                        VOPProfile P,
+                        SDPatternOperator node = null_frag,
+                        string revOp = opName,
+                        bit GFX9Renamed = 0> {
   let renamedInGFX9 = GFX9Renamed in {
-
-    def _e32 : VOP2_Pseudo <opName, P>,
+    def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
                Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+  } // End renamedInGFX9 = GFX9Renamed
+}
 
+multiclass VOP2Inst_e64<string opName,
+                        VOPProfile P,
+                        SDPatternOperator node = null_frag,
+                        string revOp = opName,
+                        bit GFX9Renamed = 0> {
+  let renamedInGFX9 = GFX9Renamed in {
     def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
                Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+  } // End renamedInGFX9 = GFX9Renamed
+}
 
-    def _sdwa  : VOP2_SDWA_Pseudo <opName, P>;
+multiclass VOP2Inst_sdwa<string opName,
+                         VOPProfile P,
+                         SDPatternOperator node = null_frag,
+                         string revOp = opName,
+                         bit GFX9Renamed = 0> {
+  let renamedInGFX9 = GFX9Renamed in {
+    def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+  } // End renamedInGFX9 = GFX9Renamed
+}
 
+multiclass VOP2Inst<string opName,
+                    VOPProfile P,
+                    SDPatternOperator node = null_frag,
+                    string revOp = opName,
+                    bit GFX9Renamed = 0> :
+    VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>,
+    VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
+    VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed> {
+  let renamedInGFX9 = GFX9Renamed in {
+    foreach _ = BoolToList<P.HasExtDPP>.ret in
+      def _dpp  : VOP2_DPP_Pseudo <opName, P>;
   }
 }
 
@@ -144,12 +176,14 @@ multiclass VOP2bInst <string opName,
   let renamedInGFX9 = GFX9Renamed in {
     let SchedRW = [Write32Bit, WriteSALU] in {
       let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
-        def _e32 : VOP2_Pseudo <opName, P>,
+        def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
                    Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
 
         def _sdwa  : VOP2_SDWA_Pseudo <opName, P> {
           let AsmMatchConverter = "cvtSdwaVOP2b";
         }
+        foreach _ = BoolToList<P.HasExtDPP>.ret in
+          def _dpp  : VOP2_DPP_Pseudo <opName, P>;
       }
 
       def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -172,6 +206,9 @@ multiclass VOP2eInst <string opName,
       def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
         let AsmMatchConverter = "cvtSdwaVOP2b";
       }
+
+      foreach _ = BoolToList<P.HasExtDPP>.ret in
+        def _dpp  : VOP2_DPP_Pseudo <opName, P>;
     }
 
     def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -211,9 +248,9 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
   let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
                        0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
-  let InsDPP = (ins DstRCDPP:$old,
-                    Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+  let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                     Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+                    VGPR_32:$src2, // stub argument
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
 
@@ -230,21 +267,15 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
   let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
   let HasSrc2 = 0;
   let HasSrc2Mods = 0;
-  let HasExt = 1;
-  let HasSDWA9 = 0;
-}
 
-def VOP_MAC_F16 : VOP_MAC <f16> {
-  // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
-  // 'not a string initializer' error.
-  let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f16>.ret;
+  let HasExt = 1;
+  let HasExtDPP = 1;
+  let HasExtSDWA = 1;
+  let HasExtSDWA9 = 0;
 }
 
-def VOP_MAC_F32 : VOP_MAC <f32> {
-  // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
-  // 'not a string initializer' error.
-  let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f32>.ret;
-}
+def VOP_MAC_F16 : VOP_MAC <f16>;
+def VOP_MAC_F32 : VOP_MAC <f32>;
 
 // Write out to vcc or arbitrary SGPR.
 def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
@@ -290,7 +321,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
   let HasExt = 1;
-  let HasSDWA9 = 1;
+  let HasExtDPP = 1;
+  let HasExtSDWA = 1;
+  let HasExtSDWA9 = 1;
 }
 
 // Read in from vcc or arbitrary SGPR
@@ -321,7 +354,9 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
   let HasExt = 1;
-  let HasSDWA9 = 1;
+  let HasExtDPP = 1;
+  let HasExtSDWA = 1;
+  let HasExtSDWA9 = 1;
 }
 
 def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
@@ -331,8 +366,11 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
   let Ins64 = Ins32;
   let Asm32 = " $vdst, $src0, $src1";
   let Asm64 = Asm32;
+
   let HasExt = 0;
-  let HasSDWA9 = 0;
+  let HasExtDPP = 0;
+  let HasExtSDWA = 0;
+  let HasExtSDWA9 = 0;
 }
 
 def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
@@ -342,20 +380,23 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
   let Ins64 = Ins32;
   let Asm32 = " $vdst, $src0, $src1";
   let Asm64 = Asm32;
-  let HasExt = 0;
-  let HasSDWA9 = 0;
   let HasSrc2 = 0;
   let HasSrc2Mods = 0;
+
+  let HasExt = 0;
+  let HasExtDPP = 0;
+  let HasExtSDWA = 0;
+  let HasExtSDWA9 = 0;
 }
 
 //===----------------------------------------------------------------------===//
 // VOP2 Instructions
 //===----------------------------------------------------------------------===//
 
-let SubtargetPredicate = isGCN in {
+let SubtargetPredicate = isGCN, Predicates = [isGCN] in {
 
 defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
-def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
 
 let isCommutable = 1 in {
 defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
@@ -363,29 +404,29 @@ defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>;
 defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
 defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
 defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>;
-defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>;
-defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>;
-defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>;
-defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>;
-defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
-defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
-defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>;
-defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>;
-defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>;
-defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_I32_I32_I32>;
+defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_i24>;
+defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
+defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_u24>;
+defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
+defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
+defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
+defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
+defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
+defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
+defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
 defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">;
 defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">;
 defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">;
-defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_I32_I32_I32>;
-defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_I32_I32_I32>;
-defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>;
+defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
+defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
+defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
 
 let Constraints = "$vdst = $src2", DisableEncoding="$src2",
     isConvertibleToThreeAddress = 1 in {
 defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
 }
 
-def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">;
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
 
 // No patterns so that the scalar instructions are always selected.
 // The scalar versions will be replaced with vector when needed later.
@@ -411,11 +452,11 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub
 // These are special and do not read the exec mask.
 let isConvergent = 1, Uses = []<Register> in {
 def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
-  [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
+  [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
 
 let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
 def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
-  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
 } // End $vdst = $vdst_in, DisableEncoding $vdst_in
 } // End isConvergent = 1
 
@@ -425,13 +466,13 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
 defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
 defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
 defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>;
-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_u16_f32>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16_F32_F32>, AMDGPUpkrtz_f16_f32>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>;
 
-} // End SubtargetPredicate = isGCN
+} // End SubtargetPredicate = isGCN, Predicates = [isGCN]
 
 def : GCNPat<
     (AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
@@ -444,40 +485,99 @@ def : GCNPat<
 >;
 
 // These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
 
 defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>;
 defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
 
 let isCommutable = 1 in {
 defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
 } // End isCommutable = 1
 
-} // End let SubtargetPredicate = SICI
+} // End let SubtargetPredicate = SICI, Predicates = [isSICI]
+
+class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
+  GCNPat<
+      (getDivergentFrag<Op>.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1),
+      !if(!cast<Commutable_REV>(Inst).IsOrig,
+        (Inst $src0, $src1),
+        (Inst $src1, $src0)
+      )
+  >;
+
+let AddedComplexity = 1 in {
+  def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
+  def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
+  def : DivergentBinOp<shl, V_LSHLREV_B32_e64>;
+}
+
+let SubtargetPredicate = HasAddNoCarryInsts in {
+  def : DivergentBinOp<add, V_ADD_U32_e32>;
+  def : DivergentBinOp<sub, V_SUB_U32_e32>;
+  def : DivergentBinOp<sub, V_SUBREV_U32_e32>;
+}
+
+
+def : DivergentBinOp<add, V_ADD_I32_e32>;
+
+def : DivergentBinOp<add, V_ADD_I32_e64>;
+def : DivergentBinOp<sub, V_SUB_I32_e32>;
+
+def : DivergentBinOp<sub, V_SUBREV_I32_e32>;
+
+def : DivergentBinOp<srl, V_LSHRREV_B32_e32>;
+def : DivergentBinOp<sra, V_ASHRREV_I32_e32>;
+def : DivergentBinOp<shl, V_LSHLREV_B32_e32>;
+def : DivergentBinOp<adde, V_ADDC_U32_e32>;
+def : DivergentBinOp<sube, V_SUBB_U32_e32>;
+
+class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
+  GCNPat<
+      (getDivergentFrag<Op>.ret i64:$src0, i64:$src1),
+      (REG_SEQUENCE VReg_64,
+        (Inst
+          (i32 (EXTRACT_SUBREG $src0, sub0)),
+          (i32 (EXTRACT_SUBREG $src1, sub0))
+        ), sub0,
+        (Inst
+          (i32 (EXTRACT_SUBREG $src0, sub1)),
+          (i32 (EXTRACT_SUBREG $src1, sub1))
+        ), sub1
+      )
+  >;
+
+def :  divergent_i64_BinOp <and, V_AND_B32_e32>;
+def :  divergent_i64_BinOp <or,  V_OR_B32_e32>;
+def :  divergent_i64_BinOp <xor, V_XOR_B32_e32>;
 
 let SubtargetPredicate = Has16BitInsts in {
 
+let FPDPRounding = 1 in {
 def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
+defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+} // End FPDPRounding = 1
+
 defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
 defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
 defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
-defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
 
 let isCommutable = 1 in {
+let FPDPRounding = 1 in {
 defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
 defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
 defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
 defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
 def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
+} // End FPDPRounding = 1
 defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
 defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
 defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
 defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
-defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
-defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
+defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
+defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
 defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
 defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
 defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
@@ -698,13 +798,8 @@ defm V_CVT_PK_I16_I32     : VOP2_Real_e32e64_si <0x31>;
 // VI
 //===----------------------------------------------------------------------===//
 
-class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> :
-  VOP_DPP <OpName, P> {
-  let Defs = ps.Defs;
-  let Uses = ps.Uses;
-  let SchedRW = ps.SchedRW;
-  let hasSideEffects = ps.hasSideEffects;
-
+class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
+  VOP_DPPe <P> {
   bits<8> vdst;
   bits<8> src1;
   let Inst{8-0}   = 0xfa; //dpp
@@ -716,12 +811,6 @@ class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfil
 
 let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
 
-multiclass VOP32_Real_vi <bits<10> op> {
-  def _vi :
-    VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
-    VOP3e_vi<op, !cast<VOP2_Pseudo>(NAME).Pfl>;
-}
-
 multiclass VOP2_Real_MADK_vi <bits<6> op> {
   def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
             VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
@@ -791,8 +880,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
       VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
       let AsmString = AsmName # ps.AsmOperands;
     }
-  def _dpp :
-    VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>;
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
+        VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
+        let AsmString = AsmName # ps.AsmOperands;
+      }
 }
 }
 
@@ -819,10 +913,14 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
       VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
       let AsmString = AsmName # ps.AsmOperands;
     }
-  def _dpp_gfx9 :
-    VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> {
-      let DecoderNamespace = "SDWA9";
-    }
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
+        VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
+        let AsmString = AsmName # ps.AsmOperands;
+        let DecoderNamespace = "SDWA9";
+      }
 }
 
 multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
@@ -840,19 +938,23 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
     VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
     }
-  def _dpp_gfx9 :
-    VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
-      let DecoderNamespace = "SDWA9";
-    }
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
+        let DecoderNamespace = "SDWA9";
+      }
 }
 
 } // AssemblerPredicates = [isGFX9]
 
 multiclass VOP2_Real_e32e64_vi <bits<6> op> :
   Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
 defm V_CNDMASK_B32        : VOP2_Real_e32e64_vi <0x0>;
@@ -899,9 +1001,6 @@ defm V_ADD_U32            : VOP2_Real_e32e64_gfx9 <0x34>;
 defm V_SUB_U32            : VOP2_Real_e32e64_gfx9 <0x35>;
 defm V_SUBREV_U32         : VOP2_Real_e32e64_gfx9 <0x36>;
 
-defm V_READLANE_B32       : VOP32_Real_vi <0x289>;
-defm V_WRITELANE_B32      : VOP32_Real_vi <0x28a>;
-
 defm V_BFM_B32            : VOP2_Real_e64only_vi <0x293>;
 defm V_BCNT_U32_B32       : VOP2_Real_e64only_vi <0x28b>;
 defm V_MBCNT_LO_U32_B32   : VOP2_Real_e64only_vi <0x28c>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 17ae08dc62670..4b8c1f208a0ed 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -17,16 +17,16 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
     (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp));
 
   list<dag> ret3 = [(set P.DstVT:$vdst,
-    (node (P.Src0VT src0),
+    (DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
           (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
           (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))];
 
   list<dag> ret2 = [(set P.DstVT:$vdst,
-    (node (P.Src0VT src0),
+    (DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
           (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))];
 
   list<dag> ret1 = [(set P.DstVT:$vdst,
-    (node (P.Src0VT src0)))];
+    (DivergentFragOrOp<node, P>.ret (P.Src0VT src0)))];
 
   list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
                   !if(!eq(P.NumSrcArgs, 2), ret2,
@@ -35,18 +35,18 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
 
 class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
   list<dag> ret3 = [(set P.DstVT:$vdst,
-    (node (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+    (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
                                     (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
           (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
           (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))];
 
   list<dag> ret2 = [(set P.DstVT:$vdst,
-    (node !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+    (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
                           (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
           (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))];
 
   list<dag> ret1 = [(set P.DstVT:$vdst,
-    (node (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+    (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
 
   list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
                   !if(!eq(P.NumSrcArgs, 2), ret2,
@@ -55,18 +55,18 @@ class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
 
 class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
   list<dag> ret3 = [(set P.DstVT:$vdst,
-    (node (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+    (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
                                     (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
           (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)),
           (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))];
 
   list<dag> ret2 = [(set P.DstVT:$vdst,
-    (node !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+    (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
                           (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
           (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))];
 
   list<dag> ret1 = [(set P.DstVT:$vdst,
-    (node (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+    (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
 
   list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
                   !if(!eq(P.NumSrcArgs, 2), ret2,
@@ -75,18 +75,18 @@ class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
 
 class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> {
   list<dag> ret3 = [(set P.DstVT:$vdst,
-    (node (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+    (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
                                     (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
           (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)),
           (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))];
 
   list<dag> ret2 = [(set P.DstVT:$vdst,
-    (node !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+    (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
                           (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
           (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))];
 
   list<dag> ret1 = [(set P.DstVT:$vdst,
-    (node (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+    (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
 
   list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
                   !if(!eq(P.NumSrcArgs, 2), ret2,
@@ -94,9 +94,9 @@ class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> {
 }
 
 class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
-  list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
-  list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))];
-  list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0))];
+  list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
+  list<dag> ret2 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0, P.Src1VT:$src1))];
+  list<dag> ret1 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0))];
   list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
                   !if(!eq(P.NumSrcArgs, 2), ret2,
                   ret1));
@@ -185,6 +185,7 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf
                   getAsm64<HasDst, NumSrcArgs, HasIntClamp,
                            HasModifiers, HasOMod, DstVT>.ret,
                   P.Asm64));
+  let NeedPatGen = P.NeedPatGen;
 }
 
 class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
@@ -219,7 +220,8 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
 // VOP3 INTERP
 //===----------------------------------------------------------------------===//
 
-class VOP3Interp<string OpName, VOPProfile P> : VOP3_Pseudo<OpName, P> {
+class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
+                 VOP3_Pseudo<OpName, P, pattern> {
   let AsmMatchConverter = "cvtVOP3Interp";
 }
 
@@ -291,11 +293,13 @@ def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
 def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
 
 let SchedRW = [WriteDoubleAdd] in {
+let FPDPRounding = 1 in {
 def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
 def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
 def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
-def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
-def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
+} // End FPDPRounding = 1
+def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
+def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
 } // End SchedRW = [WriteDoubleAdd]
 
 let SchedRW = [WriteQuarterRate32] in {
@@ -323,6 +327,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
 def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
   getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
   let SchedRW = [WriteDouble];
+  let FPDPRounding = 1;
 }
 } // End Uses = [VCC, EXEC]
 
@@ -353,10 +358,10 @@ def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CL
 def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
 def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
 
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
 def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
 def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
 
 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
   let SchedRW = [WriteFloatFMA, WriteSALU];
@@ -367,6 +372,7 @@ def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32,
 def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
   let SchedRW = [WriteDouble, WriteSALU];
   let AsmMatchConverter = "";
+  let FPDPRounding = 1;
 }
 
 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -381,12 +387,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
 
 let SchedRW = [Write64Bit] in {
 // These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
-def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>;
-def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>>;
-def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>>;
+let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
+def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, shl>;
+def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, srl>;
+def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, sra>;
 def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isSICI
+} // End SubtargetPredicate = isSICI, Predicates = [isSICI]
 
 let SubtargetPredicate = isVI in {
 def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
@@ -395,6 +401,22 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
 } // End SubtargetPredicate = isVI
 } // End SchedRW = [Write64Bit]
 
+let Predicates = [isVI] in {
+def : GCNPat <
+ (getDivergentFrag<shl>.ret i64:$x, i32:$y),
+ (V_LSHLREV_B64 $y, $x)
+>;
+def : AMDGPUPat <
+ (getDivergentFrag<srl>.ret i64:$x, i32:$y),
+ (V_LSHRREV_B64 $y, $x)
+>;
+def : AMDGPUPat <
+ (getDivergentFrag<sra>.ret i64:$x, i32:$y),
+ (V_ASHRREV_I64 $y, $x)
+>;
+}
+
+
 let SubtargetPredicate = isCIVI in {
 
 let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
@@ -414,33 +436,51 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
 
 def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
   let Predicates = [Has16BitInsts, isVIOnly];
+  let FPDPRounding = 1;
 }
 def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
                                       VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
   let renamedInGFX9 = 1;
   let Predicates = [Has16BitInsts, isGFX9];
+  let FPDPRounding = 1;
+}
+
+def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> {
+  let Predicates = [Has16BitInsts, isVIOnly];
+  let FPDPRounding = 1;
+}
+def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> {
+  let renamedInGFX9 = 1;
+  let Predicates = [Has16BitInsts, isGFX9];
+  let FPDPRounding = 1;
 }
 
 let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in {
 
 let renamedInGFX9 = 1 in {
-def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
 def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
-def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
+let FPDPRounding = 1 in {
+def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
+let Uses = [M0, EXEC] in {
 def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
-}
+} // End Uses = [M0, EXEC]
+} // End FPDPRounding = 1
+} // End renamedInGFX9 = 1
 
 let SubtargetPredicate = isGFX9 in {
-def V_MAD_F16_gfx9   : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+def V_MAD_F16_gfx9   : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>> {
+  let FPDPRounding = 1;
+}
 def V_MAD_U16_gfx9   : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
 def V_MAD_I16_gfx9   : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
-def V_FMA_F16_gfx9   : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
 def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
 } // End SubtargetPredicate = isGFX9
 
+let Uses = [M0, EXEC], FPDPRounding = 1 in {
 def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
 def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+} // End Uses = [M0, EXEC], FPDPRounding = 1
 
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
 
@@ -461,17 +501,6 @@ def : GCNPat <
   (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
 >;
 
-def : GCNPat<
-  (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
-  (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
->;
-
-def : GCNPat<
-  (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
-   (REG_SEQUENCE VReg_64,
-     (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)), sub0,
-     (V_MOV_B32_e32 (i32 0)), sub1)
->;
 }
 
 defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
@@ -479,6 +508,37 @@ defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
 
 } // End Predicates = [Has16BitInsts]
 
+class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
+  (ops node:$x, node:$y, node:$z),
+  // When the inner operation is used multiple times, selecting 3-op
+  // instructions may still be beneficial -- if the other users can be
+  // combined similarly. Let's be conservative for now.
+  (op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z),
+  [{
+    // Only use VALU ops when the result is divergent.
+    if (!N->isDivergent())
+      return false;
+
+    // Check constant bus limitations.
+    //
+    // Note: Use !isDivergent as a conservative proxy for whether the value
+    //       is in an SGPR (uniform values can end up in VGPRs as well).
+    unsigned ConstantBusUses = 0;
+    for (unsigned i = 0; i < 3; ++i) {
+      if (!Operands[i]->isDivergent() &&
+          !isInlineImmediate(Operands[i].getNode())) {
+        ConstantBusUses++;
+        if (ConstantBusUses >= 2)
+          return false;
+      }
+    }
+
+    return true;
+  }]
+> {
+  let PredicateCodeUsesOperands = 1;
+}
+
 let SubtargetPredicate = isGFX9 in {
 def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
 def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -513,6 +573,22 @@ def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B3
 
 def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
 def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
+
+
+class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
+  // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
+  (ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2),
+  (inst i32:$src0, i32:$src1, i32:$src2)
+>;
+
+def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32>;
+def : ThreeOp_i32_Pats<add, shl, V_ADD_LSHL_U32>;
+def : ThreeOp_i32_Pats<add, add, V_ADD3_U32>;
+def : ThreeOp_i32_Pats<shl, or, V_LSHL_OR_B32>;
+def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32>;
+def : ThreeOp_i32_Pats<or, or, V_OR3_B32>;
+def : ThreeOp_i32_Pats<xor, add, V_XAD_U32>;
+
 } // End SubtargetPredicate = isGFX9
 
 //===----------------------------------------------------------------------===//
@@ -662,23 +738,23 @@ defm V_MAD_I64_I32      : VOP3be_Real_ci <0x177>;
 let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
 
 multiclass VOP3_Real_vi<bits<10> op> {
-  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
-            VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+  def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3e_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
 }
 
 multiclass VOP3be_Real_vi<bits<10> op> {
-  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
-            VOP3be_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+  def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3be_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
 }
 
 multiclass VOP3OpSel_Real_gfx9<bits<10> op> {
-  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
-            VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+  def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME).Pfl>;
 }
 
 multiclass VOP3Interp_Real_vi<bits<10> op> {
-  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
-            VOP3Interp_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+  def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
 }
 
 } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
@@ -786,12 +862,15 @@ defm V_FMA_F16          : VOP3_F16_Real_vi <0x1ee>;
 defm V_DIV_FIXUP_F16    : VOP3_F16_Real_vi <0x1ef>;
 defm V_INTERP_P2_F16    : VOP3Interp_F16_Real_vi <0x276>;
 
+let FPDPRounding = 1 in {
 defm V_MAD_LEGACY_F16       : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16",       "v_mad_legacy_f16">;
-defm V_MAD_LEGACY_U16       : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16",       "v_mad_legacy_u16">;
-defm V_MAD_LEGACY_I16       : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16",       "v_mad_legacy_i16">;
 defm V_FMA_LEGACY_F16       : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16",       "v_fma_legacy_f16">;
 defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">;
 defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16", "v_interp_p2_legacy_f16">;
+} // End FPDPRounding = 1
+
+defm V_MAD_LEGACY_U16       : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16",       "v_mad_legacy_u16">;
+defm V_MAD_LEGACY_I16       : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16",       "v_mad_legacy_i16">;
 
 defm V_MAD_F16_gfx9         : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
 defm V_MAD_U16_gfx9         : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
@@ -824,6 +903,9 @@ defm V_MUL_LO_I32       : VOP3_Real_vi <0x285>;
 defm V_MUL_HI_U32       : VOP3_Real_vi <0x286>;
 defm V_MUL_HI_I32       : VOP3_Real_vi <0x287>;
 
+defm V_READLANE_B32     : VOP3_Real_vi <0x289>;
+defm V_WRITELANE_B32    : VOP3_Real_vi <0x28a>;
+
 defm V_LSHLREV_B64      : VOP3_Real_vi <0x28f>;
 defm V_LSHRREV_B64      : VOP3_Real_vi <0x290>;
 defm V_ASHRREV_I64      : VOP3_Real_vi <0x291>;
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index b51828b546797..91b45583c8489 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -42,14 +42,16 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
 }
 
 let isCommutable = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
 def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
 def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
 
+let FPDPRounding = 1 in {
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
 def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
 def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
-def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
-def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
+} // End FPDPRounding = 1
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
 
 def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
 def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
@@ -137,12 +139,14 @@ let SubtargetPredicate = HasMadMixInsts in {
 let isCommutable = 1 in {
 def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
 
+let FPDPRounding = 1 in {
 // Clamp modifier is applied after conversion to f16.
 def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 
 let ClampLo = 0, ClampHi = 1 in {
 def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 }
+} // End FPDPRounding = 1
 }
 
 defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
@@ -154,18 +158,99 @@ let SubtargetPredicate = HasFmaMixInsts in {
 let isCommutable = 1 in {
 def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
 
+let FPDPRounding = 1 in {
 // Clamp modifier is applied after conversion to f16.
 def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 
 let ClampLo = 0, ClampHi = 1 in {
 def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 }
+} // End FPDPRounding = 1
 }
 
 defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
 }
 
-let SubtargetPredicate = HasDLInsts in {
+// Defines patterns that extract signed 4bit from each Idx[0].
+foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
+  def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
+                                          (sra (shl node:$src, (i32 Idx[1])), (i32 28))>;
+
+// Defines code pattern that extracts U(unsigned/signed) 4/8bit from FromBitIndex.
+class Extract<int FromBitIndex, int BitMask, bit U>: PatFrag<
+  (ops node:$src),
+  !if (!or (!and (!eq (BitMask, 255), !eq (FromBitIndex, 24)), !eq (FromBitIndex, 28)), // last element
+       !if (U, (srl node:$src, (i32 FromBitIndex)), (sra node:$src, (i32 FromBitIndex))),
+       !if (!eq (FromBitIndex, 0), // first element
+            !if (U, (and node:$src, (i32 BitMask)),
+                 !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
+                                         (sext_inreg node:$src, i8))),
+            !if (U, (and (srl node:$src, (i32 FromBitIndex)), (i32 BitMask)),
+                 !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
+                      (sext_inreg (srl node:$src, (i32 FromBitIndex)), i8)))))>;
+
+
+foreach Type = ["I", "U"] in
+  foreach Index = 0-3 in {
+    // Defines patterns that extract each Index'ed 8bit from an unsigned
+    // 32bit scalar value;
+    def #Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>;
+
+    // Defines multiplication patterns where the multiplication is happening on each
+    // Index'ed 8bit of a 32bit scalar value.
+
+    def Mul#Type#_Elt#Index : PatFrag<
+      (ops node:$src0, node:$src1),
+      (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse))
+                            (!cast<Extract>(#Type#Index#"_8bit") node:$src0),
+                            (!cast<Extract>(#Type#Index#"_8bit") node:$src1))>;
+  }
+
+// Different variants of dot8 patterns cause a huge increase in the compile time.
+// Define non-associative/commutative add/mul to prevent permutation in the dot8
+// pattern.
+def NonACAdd        : SDNode<"ISD::ADD"       , SDTIntBinOp>;
+def NonACAdd_oneuse : HasOneUseBinOp<NonACAdd>;
+
+def NonACAMDGPUmul_u24        : SDNode<"AMDGPUISD::MUL_U24"       , SDTIntBinOp>;
+def NonACAMDGPUmul_u24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_u24>;
+
+def NonACAMDGPUmul_i24        : SDNode<"AMDGPUISD::MUL_I24"       , SDTIntBinOp>;
+def NonACAMDGPUmul_i24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_i24>;
+
+foreach Type = ["I", "U"] in
+  foreach Index = 0-7 in {
+    // Defines patterns that extract each Index'ed 4bit from an unsigned
+    // 32bit scalar value;
+    def #Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>;
+
+    // Defines multiplication patterns where the multiplication is happening on each
+    // Index'ed 8bit of a 32bit scalar value.
+    def Mul#Type#Index#"_4bit" : PatFrag<
+      (ops node:$src0, node:$src1),
+      (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), NonACAMDGPUmul_i24_oneuse, NonACAMDGPUmul_u24_oneuse))
+                             (!cast<Extract>(#Type#Index#"_4bit") node:$src0),
+                             (!cast<Extract>(#Type#Index#"_4bit") node:$src1))>;
+  }
+
+class UDot2Pat<Instruction Inst> : GCNPat <
+  (add (add_oneuse (AMDGPUmul_u24_oneuse (srl i32:$src0, (i32 16)),
+                                         (srl i32:$src1, (i32 16))), i32:$src2),
+       (AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)),
+                             (and i32:$src1, (i32 65535)))
+   ),
+  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))
+>;
+
+class SDot2Pat<Instruction Inst> : GCNPat <
+  (add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)),
+                                         (sra i32:$src1, (i32 16))), i32:$src2),
+       (AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16),
+                             (sext_inreg i32:$src1, i16))),
+  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))
+>;
+
+let SubtargetPredicate = HasDotInsts in {
 
 def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
 def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
@@ -192,7 +277,32 @@ defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
 defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
 defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
 
-} // End SubtargetPredicate = HasDLInsts
+def : UDot2Pat<V_DOT2_U32_U16>;
+def : SDot2Pat<V_DOT2_I32_I16>;
+
+foreach Type = ["U", "I"] in
+  def : GCNPat <
+    !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
+                      (add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
+    (!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+
+foreach Type = ["U", "I"] in
+  def : GCNPat <
+    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
+                      [1, 2, 3, 4, 5, 6, 7], lhs, y,
+                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
+    (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+
+// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
+// in the compile time. Directly handle the pattern generated by the FE here.
+foreach Type = ["U", "I"] in
+  def : GCNPat <
+    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
+                      [7, 1, 2, 3, 4, 5, 6], lhs, y,
+                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
+    (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+
+} // End SubtargetPredicate = HasDotInsts
 
 multiclass VOP3P_Real_vi<bits<10> op> {
   def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -242,7 +352,7 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
 }
 
 
-let SubtargetPredicate = HasDLInsts in {
+let SubtargetPredicate = HasDotInsts in {
 
 defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
 defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
@@ -252,4 +362,4 @@ defm V_DOT4_U32_U8  : VOP3P_Real_vi <0x3a9>;
 defm V_DOT8_I32_I4  : VOP3P_Real_vi <0x3aa>;
 defm V_DOT8_U32_U4  : VOP3P_Real_vi <0x3ab>;
 
-} // End SubtargetPredicate = HasDLInsts
+} // End SubtargetPredicate = HasDotInsts
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index cc6b8116afee1..091cac8cd35ca 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -635,6 +635,17 @@ def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
 def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
 def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
 
+def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>;
+def : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>;
+def : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>;
+def : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_e64, i16>;
+def : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_e64, i16>;
+def : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_e64, i16>;
+def : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>;
+def : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>;
+def : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>;
+def : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>;
+
 class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat <
   (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
                    (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
@@ -656,6 +667,14 @@ def : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>;
 def : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>;
 def : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>;
 
+def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>;
+def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>;
+def : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>;
+def : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_e64, f16>;
+def : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_e64, f16>;
+def : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_e64, f16>;
+
+
 def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>;
 def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>;
 def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>;
@@ -670,6 +689,13 @@ def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
 def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
 def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
 
+def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_e64, f16>;
+def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_e64, f16>;
+def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>;
+def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>;
+def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>;
+def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>;
+
 //===----------------------------------------------------------------------===//
 // Target
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index f0f7f259f71d2..7de7d90d27b3a 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -420,10 +420,10 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
   let SDWA = 1;
   let Uses = [EXEC];
 
-  let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst);
-  let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst);
-  let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
-                                     AMDGPUAsmVariants.Disable);
+  let SubtargetPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst);
+  let AssemblerPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst);
+  let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA,
+                                         AMDGPUAsmVariants.Disable);
   let DecoderNamespace = "SDWA";
 
   VOPProfile Pfl = P;
@@ -471,10 +471,10 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
   let Constraints     = ps.Constraints;
   let DisableEncoding = ps.DisableEncoding;
 
-  let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst);
-  let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst);
-  let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9,
-                                            AMDGPUAsmVariants.Disable);
+  let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst);
+  let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst);
+  let AsmVariantName = !if(ps.Pfl.HasExtSDWA9, AMDGPUAsmVariants.SDWA9,
+                                               AMDGPUAsmVariants.Disable);
   let DecoderNamespace = "SDWA9";
 
   // Copy relevant pseudo op flags
@@ -505,9 +505,14 @@ class VOP_DPPe<VOPProfile P> : Enc64 {
   let Inst{63-60} = row_mask;
 }
 
-class VOP_DPP <string OpName, VOPProfile P> :
-  InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>,
-  VOP_DPPe<P> {
+class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
+  VOP <OpName>,
+  SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>,
+  MnemonicAlias <OpName#"_dpp", OpName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
 
   let mayLoad = 0;
   let mayStore = 0;
@@ -517,15 +522,99 @@ class VOP_DPP <string OpName, VOPProfile P> :
   let VALU = 1;
   let DPP = 1;
   let Size = 8;
+  let Uses = [EXEC];
+  let isConvergent = 1;
+
+  string Mnemonic = OpName;
+  string AsmOperands = P.AsmDPP;
 
   let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
   let SubtargetPredicate = HasDPP;
-  let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst);
-  let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
-                                     AMDGPUAsmVariants.Disable);
+  let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
+  let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
+                                        AMDGPUAsmVariants.Disable);
   let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
   let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
   let DecoderNamespace = "DPP";
+
+  VOPProfile Pfl = P;
+}
+
+class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Defs = ps.Defs;
+  let Uses = ps.Uses;
+  let SchedRW = ps.SchedRW;
+  let hasSideEffects = ps.hasSideEffects;
+
+  let Constraints     = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // Copy relevant pseudo op flags
+  let isConvergent         = ps.isConvergent;
+  let SubtargetPredicate   = ps.SubtargetPredicate;
+  let AssemblerPredicate   = ps.AssemblerPredicate;
+  let AsmMatchConverter    = ps.AsmMatchConverter;
+  let AsmVariantName       = ps.AsmVariantName;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let DecoderNamespace     = ps.DecoderNamespace;
+  let Constraints          = ps.Constraints;
+  let DisableEncoding      = ps.DisableEncoding;
+  let TSFlags              = ps.TSFlags;
+}
+
+class getNumNodeArgs<SDPatternOperator Op> {
+  SDNode N = !cast<SDNode>(Op);
+  SDTypeProfile TP = N.TypeProfile;
+  int ret = TP.NumOperands;
+}
+
+
+class getDivergentFrag<SDPatternOperator Op> {
+
+  int NumSrcArgs = getNumNodeArgs<Op>.ret;
+  PatFrag ret = PatFrag <
+    !if(!eq(NumSrcArgs, 1),
+             (ops node:$src0),
+             !if(!eq(NumSrcArgs, 2),
+               (ops node:$src0, node:$src1),
+               (ops node:$src0, node:$src1, node:$src2))),
+    !if(!eq(NumSrcArgs, 1),
+             (Op $src0),
+             !if(!eq(NumSrcArgs, 2),
+               (Op $src0, $src1),
+               (Op $src0, $src1, $src2))),
+    [{ return N->isDivergent(); }]
+  >;
+}
+
+class VOPPatGen<SDPatternOperator Op, VOPProfile P> {
+
+  PatFrag Operator = getDivergentFrag < Op >.ret;
+
+  dag Ins = !foreach(tmp, P.Ins32, !subst(ins, Operator,
+                                         !subst(P.Src0RC32, P.Src0VT,
+                                               !subst(P.Src1RC32, P.Src1VT, tmp))));
+
+
+  dag Outs = !foreach(tmp, P.Outs32, !subst(outs, set,
+                                           !subst(P.DstRC, P.DstVT, tmp)));
+
+  list<dag> ret =  [!con(Outs, (set Ins))];
+}
+
+class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> {
+  list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []);
+}
+
+class DivergentFragOrOp<SDPatternOperator Op, VOPProfile P> {
+  SDPatternOperator ret = !if(!eq(P.NeedPatGen,PatGenMode.Pattern),
+   !if(!isa<SDNode>(Op), getDivergentFrag<Op>.ret, Op), Op);
 }
 
 include "VOPCInstructions.td"
author	Dimitry Andric <dim@FreeBSD.org>	2019-01-19 10:01:25 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2019-01-19 10:01:25 +0000
commit	d8e91e46262bc44006913e6796843909f1ac7bcd (patch)
tree	7d0c143d9b38190e0fa0180805389da22cd834c5 /lib/Target/AMDGPU
parent	b7eb8e35e481a74962664b63dfb09483b200209a (diff)